From e387086890a64d13c5d0c378c01f80f078d31181 Mon Sep 17 00:00:00 2001 From: jamesrom Date: Fri, 6 Oct 2023 01:43:35 +1100 Subject: [PATCH 1/6] Add support for ignoring files Add ignore.toml, configuration for ignoring files Add file_filter.py, for matching files against glob/regex patterns Update relevant code to use file filter +Tests --- pr_agent/algo/file_filter.py | 23 ++++++++++++ pr_agent/algo/pr_processing.py | 11 ++++-- pr_agent/config_loader.py | 17 +++++---- pr_agent/settings/ignore.toml | 5 +++ tests/unittest/test_file_filter.py | 59 ++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 11 deletions(-) create mode 100644 pr_agent/algo/file_filter.py create mode 100644 pr_agent/settings/ignore.toml create mode 100644 tests/unittest/test_file_filter.py diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py new file mode 100644 index 00000000..3dc78c64 --- /dev/null +++ b/pr_agent/algo/file_filter.py @@ -0,0 +1,23 @@ +import fnmatch +import re + +from pr_agent.config_loader import get_settings + +def filter_ignored(files): + """ + Filter out files that match the ignore patterns. + """ + + # load regex patterns, and translate glob patterns to regex + patterns = get_settings().ignore.regex + patterns += [fnmatch.translate(glob) for glob in get_settings().ignore.glob] + + compiled_patterns = [re.compile(r) for r in patterns] + filenames = [file.filename for file in files] + + # keep filenames that don't match the ignore regex + for r in compiled_patterns: + filenames = [f for f in filenames if not r.match(f)] + + # map filenames back to files + return [file for file in files if file.filename in filenames] diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 4d717202..4327a0f1 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -11,6 +11,7 @@ from github import RateLimitExceededException from pr_agent.algo import MAX_TOKENS from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions from pr_agent.algo.language_handler import sort_files_by_main_languages +from pr_agent.algo.file_filter import filter_ignored from pr_agent.algo.token_handler import TokenHandler, get_token_encoder from pr_agent.config_loader import get_settings from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider @@ -53,6 +54,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s logging.error(f"Rate limit exceeded for git provider API. original message {e}") raise + diff_files = filter_ignored(diff_files) + # get pr languages pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) @@ -348,16 +351,16 @@ def get_pr_multi_diffs(git_provider: GitProvider, """ Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file. The patches are split into multiple groups based on the maximum number of tokens allowed for the given model. - + Args: git_provider (GitProvider): An object that provides access to Git provider APIs. token_handler (TokenHandler): An object that handles tokens in the context of a pull request. model (str): The name of the model. max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5. - + Returns: List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model. - + Raises: RateLimitExceededException: If the rate limit for the Git provider API is exceeded. """ @@ -367,6 +370,8 @@ def get_pr_multi_diffs(git_provider: GitProvider, logging.error(f"Rate limit exceeded for git provider API. original message {e}") raise + diff_files = filter_ignored(diff_files) + # Sort files by main language pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py index 184adb82..80e091b8 100644 --- a/pr_agent/config_loader.py +++ b/pr_agent/config_loader.py @@ -12,18 +12,19 @@ global_settings = Dynaconf( envvar_prefix=False, merge_enabled=True, settings_files=[join(current_dir, f) for f in [ + "settings_prod/.secrets.toml" "settings/.secrets.toml", "settings/configuration.toml", + "settings/ignore.toml", "settings/language_extensions.toml", - "settings/pr_reviewer_prompts.toml", - "settings/pr_questions_prompts.toml", - "settings/pr_description_prompts.toml", - "settings/pr_code_suggestions_prompts.toml", - "settings/pr_sort_code_suggestions_prompts.toml", - "settings/pr_information_from_user_prompts.toml", - "settings/pr_update_changelog_prompts.toml", "settings/pr_add_docs.toml", - "settings_prod/.secrets.toml" + "settings/pr_code_suggestions_prompts.toml", + "settings/pr_description_prompts.toml", + "settings/pr_information_from_user_prompts.toml", + "settings/pr_questions_prompts.toml", + "settings/pr_reviewer_prompts.toml", + "settings/pr_sort_code_suggestions_prompts.toml", + "settings/pr_update_changelog_prompts.toml", ]] ) diff --git a/pr_agent/settings/ignore.toml b/pr_agent/settings/ignore.toml new file mode 100644 index 00000000..a59b810b --- /dev/null +++ b/pr_agent/settings/ignore.toml @@ -0,0 +1,5 @@ +[ignore] + +# Ignore files and directories matching these patterns. +glob = [] +regex = [] diff --git a/tests/unittest/test_file_filter.py b/tests/unittest/test_file_filter.py new file mode 100644 index 00000000..4856fbb4 --- /dev/null +++ b/tests/unittest/test_file_filter.py @@ -0,0 +1,59 @@ +import pytest +from pr_agent.algo.file_filter import filter_ignored +from pr_agent.config_loader import global_settings + +class TestIgnoreFilter: + def test_no_ignores(self): + """ + Test no files are ignored when no patterns are specified. + """ + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + assert filter_ignored(files) == files + + def test_glob_ignores(self, monkeypatch): + """ + Test files are ignored when glob patterns are specified. + """ + monkeypatch.setattr(global_settings.ignore, 'glob', ['*.py']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[1], + files[2] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." + + def test_regex_ignores(self, monkeypatch): + """ + Test files are ignored when regex patterns are specified. + """ + monkeypatch.setattr(global_settings.ignore, 'regex', ['^file[2-4]\..*$']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[0], + files[4] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." From b27f57d05d28d7b63434a60ee4472df4cfc4c2cf Mon Sep 17 00:00:00 2001 From: jamesrom Date: Fri, 6 Oct 2023 21:03:36 +1100 Subject: [PATCH 2/6] Update settings, documentation --- INSTALL.md | 26 +++++++++++++------------- pr_agent/config_loader.py | 12 ++++++------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index f4247ce2..492dc25c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ For other git providers, update CONFIG.GIT_PROVIDER accordingly, and check the ` ``` docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent --pr_url ask "" ``` -Note: If you want to ensure you're running a specific version of the Docker image, consider using the image's digest. +Note: If you want to ensure you're running a specific version of the Docker image, consider using the image's digest. The digest is a unique identifier for a specific version of an image. You can pull and run an image using its digest by referencing it like so: repository@sha256:digest. Always ensure you're using the correct and trusted digest for your operations. 1. To request a review for a PR using a specific digest, run the following command: @@ -89,17 +89,17 @@ chmod 600 pr_agent/settings/.secrets.toml ``` export PYTHONPATH=[$PYTHONPATH:] -python3 -m pr_agent.cli --pr_url /review -python3 -m pr_agent.cli --pr_url /ask -python3 -m pr_agent.cli --pr_url /describe -python3 -m pr_agent.cli --pr_url /improve +python3 -m pr_agent.cli --pr_url review +python3 -m pr_agent.cli --pr_url ask +python3 -m pr_agent.cli --pr_url describe +python3 -m pr_agent.cli --pr_url improve ``` --- ### Method 3: Run as a GitHub Action -You can use our pre-built Github Action Docker image to run PR-Agent as a Github Action. +You can use our pre-built Github Action Docker image to run PR-Agent as a Github Action. 1. Add the following file to your repository under `.github/workflows/pr_agent.yml`: @@ -153,7 +153,7 @@ OPENAI_KEY: The GITHUB_TOKEN secret is automatically created by GitHub. -3. Merge this change to your main branch. +3. Merge this change to your main branch. When you open your next PR, you should see a comment from `github-actions` bot with a review of your PR, and instructions on how to use the rest of the tools. 4. You may configure PR-Agent by adding environment variables under the env section corresponding to any configurable property in the [configuration](pr_agent/settings/configuration.toml) file. Some examples: @@ -221,12 +221,12 @@ git clone https://github.com/Codium-ai/pr-agent.git - Copy your app's webhook secret to the webhook_secret field. - Set deployment_type to 'app' in [configuration.toml](./pr_agent/settings/configuration.toml) -> The .secrets.toml file is not copied to the Docker image by default, and is only used for local development. +> The .secrets.toml file is not copied to the Docker image by default, and is only used for local development. > If you want to use the .secrets.toml file in your Docker image, you can add remove it from the .dockerignore file. -> In most production environments, you would inject the secrets file as environment variables or as mounted volumes. +> In most production environments, you would inject the secrets file as environment variables or as mounted volumes. > For example, in order to inject a secrets file as a volume in a Kubernetes environment you can update your pod spec to include the following, > assuming you have a secret named `pr-agent-settings` with a key named `.secrets.toml`: -``` +``` volumes: - name: settings-volume secret: @@ -322,7 +322,7 @@ Example IAM permissions to that user to allow access to CodeCommit: "codecommit:PostComment*", "codecommit:PutCommentReaction", "codecommit:UpdatePullRequestDescription", - "codecommit:UpdatePullRequestTitle" + "codecommit:UpdatePullRequestTitle" ], "Resource": "*" } @@ -366,8 +366,8 @@ WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") - Your OpenAI key. - In the [gitlab] section, fill in personal_access_token and shared_secret. The access token can be a personal access token, or a group or project access token. - Set deployment_type to 'gitlab' in [configuration.toml](./pr_agent/settings/configuration.toml) -5. Create a webhook in GitLab. Set the URL to the URL of your app's server. Set the secret token to the generated secret from step 2. -In the "Trigger" section, check the ‘comments’ and ‘merge request events’ boxes. +5. Create a webhook in GitLab. Set the URL to the URL of your app's server. Set the secret token to the generated secret from step 2. +In the "Trigger" section, check the ‘comments’ and ‘merge request events’ boxes. 6. Test your installation by opening a merge request or commenting or a merge request using one of CodiumAI's commands. diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py index 80e091b8..3b0b0360 100644 --- a/pr_agent/config_loader.py +++ b/pr_agent/config_loader.py @@ -12,19 +12,19 @@ global_settings = Dynaconf( envvar_prefix=False, merge_enabled=True, settings_files=[join(current_dir, f) for f in [ - "settings_prod/.secrets.toml" "settings/.secrets.toml", "settings/configuration.toml", "settings/ignore.toml", "settings/language_extensions.toml", - "settings/pr_add_docs.toml", - "settings/pr_code_suggestions_prompts.toml", - "settings/pr_description_prompts.toml", - "settings/pr_information_from_user_prompts.toml", - "settings/pr_questions_prompts.toml", "settings/pr_reviewer_prompts.toml", + "settings/pr_questions_prompts.toml", + "settings/pr_description_prompts.toml", + "settings/pr_code_suggestions_prompts.toml", "settings/pr_sort_code_suggestions_prompts.toml", + "settings/pr_information_from_user_prompts.toml", "settings/pr_update_changelog_prompts.toml", + "settings/pr_add_docs.toml", + "settings_prod/.secrets.toml" ]] ) From baa0e95227fc985cc6f4f28fbd088acb20a1f3fa Mon Sep 17 00:00:00 2001 From: jamesrom Date: Fri, 6 Oct 2023 21:53:10 +1100 Subject: [PATCH 3/6] Code comments for ignore.toml --- pr_agent/settings/ignore.toml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pr_agent/settings/ignore.toml b/pr_agent/settings/ignore.toml index a59b810b..429d3887 100644 --- a/pr_agent/settings/ignore.toml +++ b/pr_agent/settings/ignore.toml @@ -1,5 +1,11 @@ [ignore] -# Ignore files and directories matching these patterns. -glob = [] -regex = [] +glob = [ + # Ignore files and directories matching these glob patterns. + # See https://docs.python.org/3/library/glob.html + 'vendor/**', +] +regex = [ + # Ignore files and directories matching these regex patterns. + # See https://learnbyexample.github.io/python-regex-cheatsheet/ +] From 6dee18b24a1c34a2aac3e32572e584dd95dd43e5 Mon Sep 17 00:00:00 2001 From: jamesrom Date: Fri, 6 Oct 2023 22:13:03 +1100 Subject: [PATCH 4/6] Update usage documentation --- Usage.md | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/Usage.md b/Usage.md index 9f4af800..4a3855e4 100644 --- a/Usage.md +++ b/Usage.md @@ -29,6 +29,16 @@ In addition to general configuration options, each tool has its own configuratio The [Tools Guide](./docs/TOOLS_GUIDE.md) provides a detailed description of the different tools and their configurations. +#### Ignoring files from analysis +In some cases, you may want to exclude specific files or directories from the analysis performed by CodiumAI PR-Agent. This can be useful, for example, when you have files that are generated automatically or files that shouldn't be reviewed, like vendored code. + +To ignore files or directories, edit the **[ignore.toml](/pr_agent/settings/ignore.toml)** configuration file. This setting is also exposed the following environment variables: + + - `IGNORE.GLOB` + - `IGNORE.REGEX` + +See [dynaconf envvars documentation](https://www.dynaconf.com/envvars/). + #### git provider The [git_provider](pr_agent/settings/configuration.toml#L4) field in the configuration file determines the GIT provider that will be used by PR-Agent. Currently, the following providers are supported: ` @@ -101,7 +111,7 @@ Any configuration value in [configuration file](pr_agent/settings/configuration. When running PR-Agent from [GitHub App](INSTALL.md#method-5-run-as-a-github-app), the default configurations from a pre-built docker will be initially loaded. #### GitHub app automatic tools -The [github_app](pr_agent/settings/configuration.toml#L56) section defines GitHub app specific configurations. +The [github_app](pr_agent/settings/configuration.toml#L56) section defines GitHub app specific configurations. An important parameter is `pr_commands`, which is a list of tools that will be **run automatically** when a new PR is opened: ``` [github_app] @@ -133,7 +143,7 @@ Note that a local `.pr_agent.toml` file enables you to edit and customize the de #### Editing the prompts The prompts for the various PR-Agent tools are defined in the `pr_agent/settings` folder. -In practice, the prompts are loaded and stored as a standard setting object. +In practice, the prompts are loaded and stored as a standard setting object. Hence, editing them is similar to editing any other configuration value - just place the relevant key in `.pr_agent.toml`file, and override the default value. For example, if you want to edit the prompts of the [describe](./pr_agent/settings/pr_description_prompts.toml) tool, you can add the following to your `.pr_agent.toml` file: @@ -158,7 +168,7 @@ You can configure settings in GitHub action by adding environment variables unde PR_CODE_SUGGESTIONS.NUM_CODE_SUGGESTIONS: 6 # Increase number of code suggestions github_action.auto_review: "true" # Enable auto review github_action.auto_describe: "true" # Enable auto describe - github_action.auto_improve: "false" # Disable auto improve + github_action.auto_improve: "false" # Disable auto improve ``` specifically, `github_action.auto_review`, `github_action.auto_describe` and `github_action.auto_improve` are used to enable/disable automatic tools that run when a new PR is opened. @@ -171,7 +181,7 @@ To use a different model than the default (GPT-4), you need to edit [configurati For models and environments not from OPENAI, you might need to provide additional keys and other parameters. See below for instructions. #### Azure -To use Azure, set in your .secrets.toml: +To use Azure, set in your .secrets.toml: ``` api_key = "" # your azure api key api_type = "azure" @@ -180,16 +190,16 @@ api_base = "" # The base URL for your Azure OpenAI resource. e.g. "https:// Date: Fri, 6 Oct 2023 22:44:29 +1100 Subject: [PATCH 5/6] Simplify filter --- pr_agent/algo/file_filter.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py index 3dc78c64..fb40cfda 100644 --- a/pr_agent/algo/file_filter.py +++ b/pr_agent/algo/file_filter.py @@ -10,14 +10,13 @@ def filter_ignored(files): # load regex patterns, and translate glob patterns to regex patterns = get_settings().ignore.regex - patterns += [fnmatch.translate(glob) for glob in get_settings().ignore.glob] + patterns += [fnmatch.translate(glob) for glob in get_settings().ignore.glob] + # compile regex patterns compiled_patterns = [re.compile(r) for r in patterns] - filenames = [file.filename for file in files] - # keep filenames that don't match the ignore regex + # keep filenames that _don't_ match the ignore regex for r in compiled_patterns: - filenames = [f for f in filenames if not r.match(f)] + files = [f for f in files if not r.match(f.filename)] - # map filenames back to files - return [file for file in files if file.filename in filenames] + return files From 92e9012fb63d21f2464f48979c9e642a43e9d6ed Mon Sep 17 00:00:00 2001 From: jamesrom Date: Sat, 7 Oct 2023 09:39:53 +1100 Subject: [PATCH 6/6] Error handling --- pr_agent/algo/file_filter.py | 25 +++++++++++++++++-------- tests/unittest/test_file_filter.py | 23 ++++++++++++++++++++++- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py index fb40cfda..cc466f57 100644 --- a/pr_agent/algo/file_filter.py +++ b/pr_agent/algo/file_filter.py @@ -8,15 +8,24 @@ def filter_ignored(files): Filter out files that match the ignore patterns. """ - # load regex patterns, and translate glob patterns to regex - patterns = get_settings().ignore.regex - patterns += [fnmatch.translate(glob) for glob in get_settings().ignore.glob] + try: + # load regex patterns, and translate glob patterns to regex + patterns = get_settings().ignore.regex + patterns += [fnmatch.translate(glob) for glob in get_settings().ignore.glob] - # compile regex patterns - compiled_patterns = [re.compile(r) for r in patterns] + # compile all valid patterns + compiled_patterns = [] + for r in patterns: + try: + compiled_patterns.append(re.compile(r)) + except re.error: + pass - # keep filenames that _don't_ match the ignore regex - for r in compiled_patterns: - files = [f for f in files if not r.match(f.filename)] + # keep filenames that _don't_ match the ignore regex + for r in compiled_patterns: + files = [f for f in files if not r.match(f.filename)] + + except Exception as e: + print(f"Could not filter file list: {e}") return files diff --git a/tests/unittest/test_file_filter.py b/tests/unittest/test_file_filter.py index 4856fbb4..43e9c9b4 100644 --- a/tests/unittest/test_file_filter.py +++ b/tests/unittest/test_file_filter.py @@ -14,7 +14,7 @@ class TestIgnoreFilter: type('', (object,), {'filename': 'file4.py'})(), type('', (object,), {'filename': 'file5.py'})() ] - assert filter_ignored(files) == files + assert filter_ignored(files) == files, "Expected all files to be returned when no ignore patterns are given." def test_glob_ignores(self, monkeypatch): """ @@ -57,3 +57,24 @@ class TestIgnoreFilter: filtered_files = filter_ignored(files) assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}." + + def test_invalid_regex(self, monkeypatch): + """ + Test invalid patterns are quietly ignored. + """ + monkeypatch.setattr(global_settings.ignore, 'regex', ['(((||', '^file[2-4]\..*$']) + + files = [ + type('', (object,), {'filename': 'file1.py'})(), + type('', (object,), {'filename': 'file2.java'})(), + type('', (object,), {'filename': 'file3.cpp'})(), + type('', (object,), {'filename': 'file4.py'})(), + type('', (object,), {'filename': 'file5.py'})() + ] + expected = [ + files[0], + files[4] + ] + + filtered_files = filter_ignored(files) + assert filtered_files == expected, f"Expected {[file.filename for file in expected]}, but got {[file.filename for file in filtered_files]}."