From 9c82047dc3096ef0533e8b29eb043fb40f782919 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 12 Nov 2024 07:50:37 +0200 Subject: [PATCH 1/2] Add validation for hunk lines matching original file content in git patch processing --- pr_agent/algo/git_patch_processing.py | 30 ++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 0ccce92d..9b319746 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -31,7 +31,7 @@ def extend_patch(original_file_str, patch_str, patch_extra_lines_before=0, def decode_if_bytes(original_file_str): - if isinstance(original_file_str, bytes): + if isinstance(original_file_str, (bytes, bytearray)): try: return original_file_str.decode('utf-8') except UnicodeDecodeError: @@ -61,23 +61,26 @@ def process_patch_lines(patch_str, original_file_str, patch_extra_lines_before, patch_lines = patch_str.splitlines() extended_patch_lines = [] + is_valid_hunk = True start1, size1, start2, size2 = -1, -1, -1, -1 RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") try: - for line in patch_lines: + for i,line in enumerate(patch_lines): if line.startswith('@@'): match = RE_HUNK_HEADER.match(line) # identify hunk header if match: # finish processing previous hunk - if start1 != -1 and patch_extra_lines_after > 0: + if is_valid_hunk and (start1 != -1 and patch_extra_lines_after > 0): delta_lines = [f' {line}' for line in original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after]] extended_patch_lines.extend(delta_lines) section_header, size1, size2, start1, start2 = extract_hunk_headers(match) - if patch_extra_lines_before > 0 or patch_extra_lines_after > 0: + is_valid_hunk = check_if_hunk_lines_matches_to_file(i, original_lines, patch_lines, start1) + + if is_valid_hunk and (patch_extra_lines_before > 0 or patch_extra_lines_after > 0): def _calc_context_limits(patch_lines_before): extended_start1 = max(1, start1 - patch_lines_before) extended_size1 = size1 + (start1 - extended_start1) + patch_extra_lines_after @@ -138,7 +141,7 @@ def process_patch_lines(patch_str, original_file_str, patch_extra_lines_before, return patch_str # finish processing last hunk - if start1 != -1 and patch_extra_lines_after > 0: + if start1 != -1 and patch_extra_lines_after > 0 and is_valid_hunk: delta_lines = original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after] # add space at the beginning of each extra line delta_lines = [f' {line}' for line in delta_lines] @@ -148,6 +151,23 @@ def process_patch_lines(patch_str, original_file_str, patch_extra_lines_before, return extended_patch_str +def check_if_hunk_lines_matches_to_file(i, original_lines, patch_lines, start1): + """ + Check if the hunk lines match the original file content. We saw cases where the hunk header line doesn't match the original file content, and then + extending the hunk with extra lines before the hunk header can cause the hunk to be invalid. + """ + is_valid_hunk = True + try: + if i + 1 < len(patch_lines) and patch_lines[i + 1][0] == ' ': # an existing line in the file + if patch_lines[i + 1].strip() != original_lines[start1 - 1].strip(): + is_valid_hunk = False + get_logger().error( + f"Invalid hunk in PR, line {start1} in hunk header doesn't match the original file content") + except: + pass + return is_valid_hunk + + def extract_hunk_headers(match): res = list(match.groups()) for i in range(len(res)): From 065777040f34205b68cf05fd9a6a69975ab31f01 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 12 Nov 2024 08:06:02 +0200 Subject: [PATCH 2/2] Improve PR file content retrieval and logging verbosity handling --- pr_agent/git_providers/github_provider.py | 62 +++++++++++++---------- pr_agent/tools/pr_code_suggestions.py | 2 + 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index 4137ea48..1efa8744 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -199,7 +199,24 @@ class GithubProvider(GitProvider): if avoid_load: original_file_content_str = "" else: - original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha) + # The base.sha will point to the current state of the base branch (including parallel merges), not the original base commit when the PR was created + # We can fix this by finding the merge base commit between the PR head and base branches + # Note that The pr.head.sha is actually correct as is - it points to the latest commit in your PR branch. + # This SHA isn't affected by parallel merges to the base branch since it's specific to your PR's branch. + repo = self.repo_obj + pr = self.pr + try: + compare = repo.compare(pr.base.sha, pr.head.sha) + merge_base_commit = compare.merge_base_commit + except Exception as e: + get_logger().error(f"Failed to get merge base commit: {e}") + merge_base_commit = pr.base + if merge_base_commit.sha != pr.base.sha: + get_logger().info( + f"Using merge base commit {merge_base_commit.sha} instead of base commit " + f"{pr.base.sha} for {file.filename}") + original_file_content_str = self._get_pr_file_content(file, merge_base_commit.sha) + if not patch: patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str) @@ -283,8 +300,7 @@ class GithubProvider(GitProvider): relevant_line_in_file, absolute_position) if position == -1: - if get_settings().config.verbosity_level >= 2: - get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") + get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") subject_type = "FILE" else: subject_type = "LINE" @@ -296,11 +312,9 @@ class GithubProvider(GitProvider): # publish all comments in a single message self.pr.create_review(commit=self.last_commit_id, comments=comments) except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to publish inline comments") + get_logger().info(f"Initially failed to publish inline comments as committable") - if (getattr(e, "status", None) == 422 - and get_settings().github.publish_inline_comments_fallback_with_verification and not disable_fallback): + if (getattr(e, "status", None) == 422 and not disable_fallback): pass # continue to try _publish_inline_comments_fallback_with_verification else: raise e # will end up with publishing the comments one by one @@ -308,8 +322,7 @@ class GithubProvider(GitProvider): try: self._publish_inline_comments_fallback_with_verification(comments) except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to publish inline code comments fallback, error: {e}") + get_logger().error(f"Failed to publish inline code comments fallback, error: {e}") raise e def _publish_inline_comments_fallback_with_verification(self, comments: list[dict]): @@ -334,11 +347,9 @@ class GithubProvider(GitProvider): for comment in fixed_comments_as_one_liner: try: self.publish_inline_comments([comment], disable_fallback=True) - if get_settings().config.verbosity_level >= 2: - get_logger().info(f"Published invalid comment as a single line comment: {comment}") + get_logger().info(f"Published invalid comment as a single line comment: {comment}") except: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to publish invalid comment as a single line comment: {comment}") + get_logger().error(f"Failed to publish invalid comment as a single line comment: {comment}") def _verify_code_comment(self, comment: dict): is_verified = False @@ -396,8 +407,7 @@ class GithubProvider(GitProvider): if fixed_comment != comment: fixed_comments.append(fixed_comment) except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to fix inline comment, error: {e}") + get_logger().error(f"Failed to fix inline comment, error: {e}") return fixed_comments def publish_code_suggestions(self, code_suggestions: list) -> bool: @@ -412,16 +422,14 @@ class GithubProvider(GitProvider): relevant_lines_end = suggestion['relevant_lines_end'] if not relevant_lines_start or relevant_lines_start == -1: - if get_settings().config.verbosity_level >= 2: - get_logger().exception( - f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}") + get_logger().exception( + f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}") continue if relevant_lines_end < relevant_lines_start: - if get_settings().config.verbosity_level >= 2: - get_logger().exception(f"Failed to publish code suggestion, " - f"relevant_lines_end is {relevant_lines_end} and " - f"relevant_lines_start is {relevant_lines_start}") + get_logger().exception(f"Failed to publish code suggestion, " + f"relevant_lines_end is {relevant_lines_end} and " + f"relevant_lines_start is {relevant_lines_start}") continue if relevant_lines_end > relevant_lines_start: @@ -445,8 +453,7 @@ class GithubProvider(GitProvider): self.publish_inline_comments(post_parameters_list) return True except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to publish code suggestion, error: {e}") + get_logger().error(f"Failed to publish code suggestion, error: {e}") return False def edit_comment(self, comment, body: str): @@ -505,6 +512,7 @@ class GithubProvider(GitProvider): elif self.deployment_type == 'user': same_comment_creator = self.github_user_id == existing_comment['user']['login'] if existing_comment['subject_type'] == 'file' and comment['path'] == existing_comment['path'] and same_comment_creator: + headers, data_patch = self.pr._requester.requestJsonAndCheck( "PATCH", f"{self.base_url}/repos/{self.repo}/pulls/comments/{existing_comment['id']}", input={"body":comment['body']} ) @@ -516,8 +524,7 @@ class GithubProvider(GitProvider): ) return True except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().error(f"Failed to publish diffview file summary, error: {e}") + get_logger().error(f"Failed to publish diffview file summary, error: {e}") return False def remove_initial_comment(self): @@ -805,8 +812,7 @@ class GithubProvider(GitProvider): link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{absolute_position}" return link except Exception as e: - if get_settings().config.verbosity_level >= 2: - get_logger().info(f"Failed adding line link, error: {e}") + get_logger().info(f"Failed adding line link, error: {e}") return "" diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index e2bc279e..31ad2bd9 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -337,6 +337,8 @@ class PRCodeSuggestions: model, add_line_numbers_to_hunks=True, disable_extra_lines=False) + self.patches_diff_list = [self.patches_diff] + self.patches_diff_no_line_number = self.remove_line_numbers([self.patches_diff])[0] if self.patches_diff: get_logger().debug(f"PR diff", artifact=self.patches_diff)