diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py
new file mode 100644
index 00000000..3dc78c64
--- /dev/null
+++ b/pr_agent/algo/file_filter.py
@@ -0,0 +1,32 @@
+import fnmatch
+import re
+
+from pr_agent.config_loader import get_settings
+
+
+def filter_ignored(files):
+    """
+    Filter out files whose filename matches one of the configured ignore patterns.
+
+    Patterns come from the `ignore` settings section: `regex` entries are used as-is,
+    `glob` entries are translated to regular expressions with `fnmatch.translate`.
+
+    Args:
+        files: List of objects exposing a `filename` attribute.
+
+    Returns:
+        list: The files whose filename matches none of the patterns, in input order.
+    """
+    ignore_settings = get_settings().ignore
+
+    # Build a fresh list: extending the configured regex list in place would append
+    # the translated globs to the shared settings again on every call.
+    patterns = list(ignore_settings.regex)
+    patterns += [fnmatch.translate(glob) for glob in ignore_settings.glob]
+    compiled_patterns = [re.compile(r) for r in patterns]
+
+    # keep files whose filename matches none of the ignore patterns
+    return [
+        file for file in files
+        if not any(r.match(file.filename) for r in compiled_patterns)
+    ]
diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index 4d717202..4327a0f1 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -11,6 +11,7 @@ from github import RateLimitExceededException
 from pr_agent.algo import MAX_TOKENS
 from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions
 from pr_agent.algo.language_handler import sort_files_by_main_languages
+from pr_agent.algo.file_filter import filter_ignored
 from pr_agent.algo.token_handler import TokenHandler, get_token_encoder
 from pr_agent.config_loader import get_settings
 from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider
@@ -53,6 +54,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
         logging.error(f"Rate limit exceeded for git provider API. original message {e}")
         raise
 
+    diff_files = filter_ignored(diff_files)
+
     # get pr languages
     pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
 
@@ -348,16 +351,16 @@ def get_pr_multi_diffs(git_provider: GitProvider,
     """
     Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file.
     The patches are split into multiple groups based on the maximum number of tokens allowed for the given model.
-    
+
     Args:
         git_provider (GitProvider): An object that provides access to Git provider APIs.
         token_handler (TokenHandler): An object that handles tokens in the context of a pull request.
         model (str): The name of the model.
         max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5.
-    
+
     Returns:
         List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model.
-    
+
     Raises:
         RateLimitExceededException: If the rate limit for the Git provider API is exceeded.
     """
@@ -367,6 +370,8 @@ def get_pr_multi_diffs(git_provider: GitProvider,
         logging.error(f"Rate limit exceeded for git provider API. original message {e}")
         raise
 
+    diff_files = filter_ignored(diff_files)
+
     # Sort files by main language
     pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
 
diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py
index 184adb82..80e091b8 100644
--- a/pr_agent/config_loader.py
+++ b/pr_agent/config_loader.py
@@ -12,18 +12,19 @@ global_settings = Dynaconf(
     envvar_prefix=False,
     merge_enabled=True,
     settings_files=[join(current_dir, f) for f in [
+        "settings_prod/.secrets.toml",
         "settings/.secrets.toml",
         "settings/configuration.toml",
+        "settings/ignore.toml",
         "settings/language_extensions.toml",
-        "settings/pr_reviewer_prompts.toml",
-        "settings/pr_questions_prompts.toml",
-        "settings/pr_description_prompts.toml",
-        "settings/pr_code_suggestions_prompts.toml",
-        "settings/pr_sort_code_suggestions_prompts.toml",
-        "settings/pr_information_from_user_prompts.toml",
-        "settings/pr_update_changelog_prompts.toml",
         "settings/pr_add_docs.toml",
-        "settings_prod/.secrets.toml"
+        "settings/pr_code_suggestions_prompts.toml",
+        "settings/pr_description_prompts.toml",
+        "settings/pr_information_from_user_prompts.toml",
+        "settings/pr_questions_prompts.toml",
+        "settings/pr_reviewer_prompts.toml",
+        "settings/pr_sort_code_suggestions_prompts.toml",
+        "settings/pr_update_changelog_prompts.toml",
     ]]
 )
 
diff --git a/pr_agent/settings/ignore.toml b/pr_agent/settings/ignore.toml
new file mode 100644
index 00000000..a59b810b
--- /dev/null
+++ b/pr_agent/settings/ignore.toml
@@ -0,0 +1,5 @@
+[ignore]
+
+# Ignore files and directories matching these patterns.
+glob = []
+regex = []
diff --git a/tests/unittest/test_file_filter.py b/tests/unittest/test_file_filter.py
new file mode 100644
index 00000000..4856fbb4
--- /dev/null
+++ b/tests/unittest/test_file_filter.py
@@ -0,0 +1,69 @@
+import pytest
+from pr_agent.algo.file_filter import filter_ignored
+from pr_agent.config_loader import global_settings
+
+class TestIgnoreFilter:
+    def test_no_ignores(self):
+        """
+        Test no files are ignored when no patterns are specified.
+        """
+        files = [
+            type('', (object,), {'filename': 'file1.py'})(),
+            type('', (object,), {'filename': 'file2.java'})(),
+            type('', (object,), {'filename': 'file3.cpp'})(),
+            type('', (object,), {'filename': 'file4.py'})(),
+            type('', (object,), {'filename': 'file5.py'})()
+        ]
+        assert filter_ignored(files) == files
+
+    def test_glob_ignores(self, monkeypatch):
+        """
+        Test files are ignored when glob patterns are specified.
+        """
+        monkeypatch.setattr(global_settings.ignore, 'glob', ['*.py'])
+
+        files = [
+            type('', (object,), {'filename': 'file1.py'})(),
+            type('', (object,), {'filename': 'file2.java'})(),
+            type('', (object,), {'filename': 'file3.cpp'})(),
+            type('', (object,), {'filename': 'file4.py'})(),
+            type('', (object,), {'filename': 'file5.py'})()
+        ]
+        expected = [
+            files[1],
+            files[2]
+        ]
+
+        filtered_files = filter_ignored(files)
+        assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}."
+
+    def test_regex_ignores(self, monkeypatch):
+        """
+        Test files are ignored when regex patterns are specified.
+        """
+        monkeypatch.setattr(global_settings.ignore, 'regex', [r'^file[2-4]\..*$'])
+
+        files = [
+            type('', (object,), {'filename': 'file1.py'})(),
+            type('', (object,), {'filename': 'file2.java'})(),
+            type('', (object,), {'filename': 'file3.cpp'})(),
+            type('', (object,), {'filename': 'file4.py'})(),
+            type('', (object,), {'filename': 'file5.py'})()
+        ]
+        expected = [
+            files[0],
+            files[4]
+        ]
+
+        filtered_files = filter_ignored(files)
+        assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}."
+
+    def test_settings_not_mutated(self, monkeypatch):
+        """
+        Test filtering does not append the translated glob patterns to the regex setting.
+        """
+        monkeypatch.setattr(global_settings.ignore, 'glob', ['*.py'])
+        monkeypatch.setattr(global_settings.ignore, 'regex', [])
+
+        filter_ignored([type('', (object,), {'filename': 'file1.py'})()])
+        assert list(global_settings.ignore.regex) == []