diff --git a/docs/docs/usage-guide/additional_configurations.md b/docs/docs/usage-guide/additional_configurations.md index 4deca211..aba817e4 100644 --- a/docs/docs/usage-guide/additional_configurations.md +++ b/docs/docs/usage-guide/additional_configurations.md @@ -10,12 +10,17 @@ To ignore files or directories, edit the **[ignore.toml](https://github.com/Codi For example, to ignore Python files in a PR with online usage, comment on a PR: `/review --ignore.glob=['*.py']` -To ignore Python files in all PRs, set in a configuration file: +To ignore Python files in all PRs using `glob` pattern, set in a configuration file: ``` [ignore] glob = ['*.py'] ``` +And to ignore Python files in all PRs using `regex` pattern, set in a configuration file: +``` +[ignore] +regex = ['.*\.py$'] +``` ## Extra instructions All PR-Agent tools have a parameter called `extra_instructions`, that enables to add free-text extra instructions. Example usage: diff --git a/pr_agent/algo/file_filter.py b/pr_agent/algo/file_filter.py index 9f396549..fa97b283 100644 --- a/pr_agent/algo/file_filter.py +++ b/pr_agent/algo/file_filter.py @@ -4,7 +4,7 @@ import re from pr_agent.config_loader import get_settings -def filter_ignored(files): +def filter_ignored(files, platform = 'github'): """ Filter out files that match the ignore patterns. 
""" @@ -28,8 +28,16 @@ def filter_ignored(files): pass # keep filenames that _don't_ match the ignore regex - for r in compiled_patterns: - files = [f for f in files if (f.filename and not r.match(f.filename))] + if files and isinstance(files, list): + for r in compiled_patterns: + if platform == 'github': + files = [f for f in files if (f.filename and not r.match(f.filename))] + elif platform == 'bitbucket': + files = [f for f in files if (f.new.path and not r.match(f.new.path))] + elif platform == 'gitlab': + files = [f for f in files if (f['new_path'] and not r.match(f['new_path']))] + elif platform == 'azure': + files = [f for f in files if not r.match(f)] except Exception as e: print(f"Could not filter file list: {e}") diff --git a/pr_agent/algo/language_handler.py b/pr_agent/algo/language_handler.py index b4c02bee..83f7b85e 100644 --- a/pr_agent/algo/language_handler.py +++ b/pr_agent/algo/language_handler.py @@ -15,7 +15,7 @@ def filter_bad_extensions(files): return [f for f in files if f.filename is not None and is_valid_file(f.filename)] -def is_valid_file(filename): +def is_valid_file(filename: str): return filename.split('.')[-1] not in bad_extensions diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index e0ea2134..1db3dd53 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -91,7 +91,7 @@ def convert_to_markdown(output_data: dict, gfm_supported: bool = True, increment markdown_text += f"## Incremental PR Reviewer Guide ๐Ÿ”\n\n" markdown_text += f"โฎ๏ธ Review for commits since previous PR-Agent review {incremental_review}.\n\n" if gfm_supported: - markdown_text += "\n\n" + markdown_text += "
\n" # markdown_text += """""" if not output_data or not output_data.get('review', {}): @@ -108,7 +108,7 @@ def convert_to_markdown(output_data: dict, gfm_supported: bool = True, increment key_nice = 'Estimated effort to review [1-5]' if 'security concerns' in key_nice.lower(): value = emphasize_header(value.strip()) - markdown_text += f"\n" + markdown_text += f"\n" elif 'can be split' in key_nice.lower(): markdown_text += process_can_be_split(emoji, value) elif 'key issues to review' in key_nice.lower(): @@ -124,15 +124,17 @@ def convert_to_markdown(output_data: dict, gfm_supported: bool = True, increment if not issue: continue issue = emphasize_header(issue) + issue = replace_code_tags(issue) if i == 0: - markdown_text += f"\n" + markdown_text += f"\n" else: - markdown_text += f"\n\n" + markdown_text += f"\n\n" else: value = emphasize_header(value.strip('-').strip()) - markdown_text += f"\n" + value = replace_code_tags(value) + markdown_text += f"\n" else: - markdown_text += f"\n" + markdown_text += f"\n" else: if len(value.split()) > 1: markdown_text += f"{emoji} **{key_nice}:**\n\n {value}\n\n" diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index ee3a116f..9578e0fb 100644 --- a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -2,6 +2,7 @@ import os from typing import Optional, Tuple from urllib.parse import urlparse +from ..algo.file_filter import filter_ignored from ..log import get_logger from ..algo.language_handler import is_valid_file from ..algo.utils import clip_tokens, find_line_number_of_relevant_line_in_file, load_large_diff @@ -284,8 +285,20 @@ class AzureDevopsProvider(GitProvider): # # diffs = list(set(diffs)) + diffs_original = diffs + diffs = filter_ignored(diffs_original, 'azure') + if diffs_original != diffs: + try: + get_logger().info(f"Filtered out [ignore] files for pull request:", extra= + {"files": diffs_original, # diffs is 
just a list of names + "filtered_files": diffs}) + except Exception: + pass + + invalid_files_names = [] for file in diffs: if not is_valid_file(file): + invalid_files_names.append(file) continue version = GitVersionDescriptor( @@ -350,6 +363,8 @@ class AzureDevopsProvider(GitProvider): edit_type=edit_type, ) ) + get_logger().info(f"Invalid files: {invalid_files_names}") + self.diff_files = diff_files return diff_files except Exception as e: diff --git a/pr_agent/git_providers/bitbucket_provider.py b/pr_agent/git_providers/bitbucket_provider.py index 27a88a76..4a0ce49a 100644 --- a/pr_agent/git_providers/bitbucket_provider.py +++ b/pr_agent/git_providers/bitbucket_provider.py @@ -7,6 +7,8 @@ from atlassian.bitbucket import Cloud from starlette_context import context from pr_agent.algo.types import FilePatchInfo, EDIT_TYPE +from ..algo.file_filter import filter_ignored +from ..algo.language_handler import is_valid_file from ..algo.utils import find_line_number_of_relevant_line_in_file from ..config_loader import get_settings from ..log import get_logger @@ -122,13 +124,30 @@ class BitbucketProvider(GitProvider): if self.diff_files: return self.diff_files - diffs = self.pr.diffstat() + diffs_original = list(self.pr.diffstat()) + diffs = filter_ignored(diffs_original, 'bitbucket') + if diffs != diffs_original: + try: + names_original = [d.new.path for d in diffs_original] + names_filtered = [d.new.path for d in diffs] + get_logger().info(f"Filtered out [ignore] files for PR", extra={ + 'original_files': names_original, + 'filtered_files': names_filtered + }) + except Exception as e: + pass + diff_split = [ "diff --git%s" % x for x in self.pr.diff().split("diff --git") if x.strip() ] + invalid_files_names = [] diff_files = [] for index, diff in enumerate(diffs): + if not is_valid_file(diff.new.path): + invalid_files_names.append(diff.new.path) + continue + original_file_content_str = self._get_pr_file_content( diff.old.get_data("links") ) @@ -150,6 +169,9 @@ class 
BitbucketProvider(GitProvider): file_patch_canonic_structure.edit_type = EDIT_TYPE.RENAMED diff_files.append(file_patch_canonic_structure) + if invalid_files_names: + get_logger().info(f"Invalid file names: {invalid_files_names}") + self.diff_files = diff_files return diff_files diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index 36f15201..5e7601b3 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -8,6 +8,7 @@ from github import AppAuthentication, Auth, Github, GithubException from retry import retry from starlette_context import context +from ..algo.file_filter import filter_ignored from ..algo.language_handler import is_valid_file from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file from ..config_loader import get_settings @@ -106,19 +107,22 @@ class GithubProvider(GitProvider): git_files = context.get("git_files", None) if git_files: return git_files - self.git_files = self.pr.get_files() + self.git_files = list(self.pr.get_files()) # 'list' to handle pagination context["git_files"] = self.git_files return self.git_files except Exception: if not self.git_files: - self.git_files = self.pr.get_files() + self.git_files = list(self.pr.get_files()) return self.git_files def get_num_of_files(self): - if self.git_files: + if hasattr(self.git_files, "totalCount"): return self.git_files.totalCount else: - return -1 + try: + return len(self.git_files) + except Exception as e: + return -1 @retry(exceptions=RateLimitExceeded, tries=get_settings().github.ratelimit_retries, delay=2, backoff=2, jitter=(1, 3)) @@ -142,12 +146,24 @@ class GithubProvider(GitProvider): if self.diff_files: return self.diff_files - files = self.get_files() - diff_files = [] + # filter files using [ignore] patterns + files_original = self.get_files() + files = filter_ignored(files_original) + if files_original != files: + try: + names_original = 
[file.filename for file in files_original] + names_new = [file.filename for file in files] + get_logger().info(f"Filtered out [ignore] files for pull request:", extra= + {"files": names_original, + "filtered_files": names_new}) + except Exception: + pass + diff_files = [] + invalid_files_names = [] for file in files: if not is_valid_file(file.filename): - get_logger().info(f"Skipping a non-code file: {file.filename}") + invalid_files_names.append(file.filename) continue new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha) # communication with GitHub @@ -183,6 +199,8 @@ class GithubProvider(GitProvider): num_plus_lines=num_plus_lines, num_minus_lines=num_minus_lines,) diff_files.append(file_patch_canonical_structure) + if invalid_files_names: + get_logger().info(f"Filtered out files with invalid extensions: {invalid_files_names}") self.diff_files = diff_files try: diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py index ad18ba1d..766a6743 100644 --- a/pr_agent/git_providers/gitlab_provider.py +++ b/pr_agent/git_providers/gitlab_provider.py @@ -6,6 +6,7 @@ from urllib.parse import urlparse import gitlab from gitlab import GitlabGetError +from ..algo.file_filter import filter_ignored from ..algo.language_handler import is_valid_file from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file from ..config_loader import get_settings @@ -84,53 +85,71 @@ class GitLabProvider(GitProvider): if self.diff_files: return self.diff_files - diffs = self.mr.changes()['changes'] + # filter files using [ignore] patterns + diffs_original = self.mr.changes()['changes'] + diffs = filter_ignored(diffs_original, 'gitlab') + if diffs != diffs_original: + try: + names_original = [diff['new_path'] for diff in diffs_original] + names_filtered = [diff['new_path'] for diff in diffs] + get_logger().info(f"Filtered out [ignore] files for merge request {self.id_mr}", extra={ + 
'original_files': names_original, + 'filtered_files': names_filtered + }) + except Exception as e: + pass + diff_files = [] + invalid_files_names = [] for diff in diffs: - if is_valid_file(diff['new_path']): - original_file_content_str = self.get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha']) - new_file_content_str = self.get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha']) + if not is_valid_file(diff['new_path']): + invalid_files_names.append(diff['new_path']) + continue - try: - if isinstance(original_file_content_str, bytes): - original_file_content_str = bytes.decode(original_file_content_str, 'utf-8') - if isinstance(new_file_content_str, bytes): - new_file_content_str = bytes.decode(new_file_content_str, 'utf-8') - except UnicodeDecodeError: - get_logger().warning( - f"Cannot decode file {diff['old_path']} or {diff['new_path']} in merge request {self.id_mr}") + original_file_content_str = self.get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha']) + new_file_content_str = self.get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha']) + try: + if isinstance(original_file_content_str, bytes): + original_file_content_str = bytes.decode(original_file_content_str, 'utf-8') + if isinstance(new_file_content_str, bytes): + new_file_content_str = bytes.decode(new_file_content_str, 'utf-8') + except UnicodeDecodeError: + get_logger().warning( + f"Cannot decode file {diff['old_path']} or {diff['new_path']} in merge request {self.id_mr}") - edit_type = EDIT_TYPE.MODIFIED - if diff['new_file']: - edit_type = EDIT_TYPE.ADDED - elif diff['deleted_file']: - edit_type = EDIT_TYPE.DELETED - elif diff['renamed_file']: - edit_type = EDIT_TYPE.RENAMED + edit_type = EDIT_TYPE.MODIFIED + if diff['new_file']: + edit_type = EDIT_TYPE.ADDED + elif diff['deleted_file']: + edit_type = EDIT_TYPE.DELETED + elif diff['renamed_file']: + edit_type = EDIT_TYPE.RENAMED - filename = diff['new_path'] - patch = diff['diff'] - if not 
patch: - patch = load_large_diff(filename, new_file_content_str, original_file_content_str) + filename = diff['new_path'] + patch = diff['diff'] + if not patch: + patch = load_large_diff(filename, new_file_content_str, original_file_content_str) - # count number of lines added and removed - patch_lines = patch.splitlines(keepends=True) - num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) - num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) - diff_files.append( - FilePatchInfo(original_file_content_str, new_file_content_str, - patch=patch, - filename=filename, - edit_type=edit_type, - old_filename=None if diff['old_path'] == diff['new_path'] else diff['old_path'], - num_plus_lines=num_plus_lines, - num_minus_lines=num_minus_lines, )) + # count number of lines added and removed + patch_lines = patch.splitlines(keepends=True) + num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) + num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) + diff_files.append( + FilePatchInfo(original_file_content_str, new_file_content_str, + patch=patch, + filename=filename, + edit_type=edit_type, + old_filename=None if diff['old_path'] == diff['new_path'] else diff['old_path'], + num_plus_lines=num_plus_lines, + num_minus_lines=num_minus_lines, )) + if invalid_files_names: + get_logger().info(f"Filtered out files with invalid extensions: {invalid_files_names}") self.diff_files = diff_files return diff_files - def get_files(self): + def get_files(self) -> list: if not self.git_files: self.git_files = [change['new_path'] for change in self.mr.changes()['changes']] return self.git_files diff --git a/pr_agent/settings/ignore.toml b/pr_agent/settings/ignore.toml index 429d3887..bc847cfc 100644 --- a/pr_agent/settings/ignore.toml +++ b/pr_agent/settings/ignore.toml @@ -8,4 +8,5 @@ glob = [ regex = [ # Ignore files and directories matching these regex patterns. 
# See https://learnbyexample.github.io/python-regex-cheatsheet/ + # for example: regex = ['.*\.toml$'] ] diff --git a/tests/unittest/test_convert_to_markdown.py b/tests/unittest/test_convert_to_markdown.py index e4918beb..d701efb2 100644 --- a/tests/unittest/test_convert_to_markdown.py +++ b/tests/unittest/test_convert_to_markdown.py @@ -52,7 +52,7 @@ class TestConvertToMarkdown: 'suggestion': "Consider raising an exception or logging a warning when 'pr_url' attribute is not found. This can help in debugging issues related to the absence of 'pr_url' in instances where it's expected. [important]\n", 'relevant_line': '[return ""](https://github.com/Codium-ai/pr-agent-pro/pull/102/files#diff-52d45f12b836f77ed1aef86e972e65404634ea4e2a6083fb71a9b0f9bb9e062fR199)'}]} - expected_output = '## PR Reviewer Guide ๐Ÿ”\n\n
      Feedback            
{emoji} {key_nice}\n\n{value}\n\n
{emoji} {key_nice}\n{value}\n\n
\n\n{issue}
\n{issue}
\n\n{issue}
\n{issue}
{emoji} {key_nice}\n\n{value}\n\n
{emoji} {key_nice}\n{value}\n\n
{emoji} {key_nice}\n\n{value}\n\n
{emoji} {key_nice}\n{value}\n\n
\n\n\n\n\n\n
โฑ๏ธ Estimated effort to review [1-5]\n\n1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n\n\n
๐Ÿงช Relevant tests\n\nNo\n\n\n
โšก Possible issues\n\nNo\n\n\n
๐Ÿ”’ Security concerns\n\nNo\n\n
\n\n\n
Code feedback:\n\n
relevant filepr_agent/git_providers/git_provider.py\n
suggestion      \n\n\n\nConsider raising an exception or logging a warning when \'pr_url\' attribute is not found. This can help in debugging issues related to the absence of \'pr_url\' in instances where it\'s expected. [important]\n\n\n
relevant linereturn ""

\n\n
' + expected_output = '## PR Reviewer Guide ๐Ÿ”\n\n\n\n\n\n\n
โฑ๏ธ Estimated effort to review [1-5]\n1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n\n\n
๐Ÿงช Relevant tests\nNo\n\n\n
โšก Possible issues\nNo\n\n\n
๐Ÿ”’ Security concerns\nNo\n\n
\n\n\n
Code feedback:\n\n
relevant filepr_agent/git_providers/git_provider.py\n
suggestion      \n\n\n\nConsider raising an exception or logging a warning when \'pr_url\' attribute is not found. This can help in debugging issues related to the absence of \'pr_url\' in instances where it\'s expected. [important]\n\n\n
relevant linereturn ""

\n\n
' assert convert_to_markdown(input_data).strip() == expected_output.strip()