find_line_number_of_relevant_line_in_file

2025-07-21 04:50:39 +08:00 · 2023-08-05 10:34:09 +03:00
parent bd86266a4b
commit fed0ea349a
7 changed files with 137 additions and 63 deletions
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

+import re
+import difflib
 import logging
-from typing import Callable, Tuple
-
+from typing import Callable, Tuple, List, Any, Sequence
 from github import RateLimitExceededException

 from pr_agent.algo import MAX_TOKENS
@@ -10,7 +11,7 @@ from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbe
 from pr_agent.algo.language_handler import sort_files_by_main_languages
 from pr_agent.algo.token_handler import TokenHandler
 from pr_agent.config_loader import get_settings
-from pr_agent.git_providers.git_provider import GitProvider
+from pr_agent.git_providers.git_provider import GitProvider, FilePatchInfo

 DELETED_FILES_ = "Deleted files:\n"

@@ -217,3 +218,53 @@ async def retry_with_fallback_models(f: Callable):
            logging.warning(f"Failed to generate prediction with {model}: {e}")
            if i == len(all_models) - 1:  # If it's the last iteration
                raise  # Re-raise the last exception
+
+
+def find_line_number_of_relevant_line_in_file(diff_files: list[FilePatchInfo], relevant_file: str,
+                                              relevant_line_in_file: str) -> Tuple[int, int]:
+    position = -1
+    absolute_position = -1
+    RE_HUNK_HEADER = re.compile(
+        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
+    for file in diff_files:
+        if file.filename.strip() == relevant_file:
+            patch = file.patch
+            patch_lines = patch.splitlines()
+
+            # try to find the line in the patch using difflib, with some margin of error
+            matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
+                                                                         file.patch.splitlines(), n=3, cutoff=0.95)
+            if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
+                relevant_line_in_file = matches_difflib[0]
+
+            delta = 0
+            for i, line in enumerate(patch_lines):
+
+                if line.startswith('@@'):
+                    delta = 0
+                    match = RE_HUNK_HEADER.match(line)
+                    start1, size1, start2, size2 = map(int, match.groups()[:4])
+                elif not line.startswith('-'):
+                    delta += 1
+
+                if relevant_line_in_file in line and line[0] != '-':
+                    position = i
+                    absolute_position = start2 + delta - 1
+                    break
+            if position == -1:
+                for i, line in enumerate(patch_lines):
+                    if line.startswith('@@'):
+                        delta = 0
+                        match = RE_HUNK_HEADER.match(line)
+                        start1, size1, start2, size2 = map(int, match.groups()[:4])
+                    elif not line.startswith('-'):
+                        delta += 1
+
+                    if relevant_line_in_file[0] == '+' and relevant_line_in_file[1:].lstrip() in line and line[
+                        0] != '-':
+                        # The model often adds a '+' to the beginning of the relevant_line_in_file even if originally
+                        # it's a context line
+                        position = i
+                        absolute_position = start2 + delta - 1
+                        break
+    return position, absolute_position
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@@ -40,7 +40,7 @@ def convert_to_markdown(output_data: dict) -> str:
        "Security concerns": "🔒",
        "General PR suggestions": "💡",
        "Insights from user's answers": "📝",
-        "Code suggestions": "🤖",
+        "Code feedback": "🤖",
    }

    for key, value in output_data.items():
@@ -50,12 +50,12 @@ def convert_to_markdown(output_data: dict) -> str:
            markdown_text += f"## {key}\n\n"
            markdown_text += convert_to_markdown(value)
        elif isinstance(value, list):
-            if key.lower() == 'code suggestions':
+            if key.lower() == 'code feedback':
                markdown_text += "\n"  # just looks nicer with additional line breaks
            emoji = emojis.get(key, "")
            markdown_text += f"- {emoji} **{key}:**\n\n"
            for item in value:
-                if isinstance(item, dict) and key.lower() == 'code suggestions':
+                if isinstance(item, dict) and key.lower() == 'code feedback':
                    markdown_text += parse_code_suggestion(item)
                elif item:
                    markdown_text += f"  - {item}\n"
@@ -100,7 +100,7 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
    Args:
    - review: A string containing the JSON message to be fixed.
    - max_iter: An integer representing the maximum number of iterations to try and fix the JSON message.
-    - code_suggestions: A boolean indicating whether to try and fix JSON messages with code suggestions.
+    - code_suggestions: A boolean indicating whether to try and fix JSON messages with code feedback.

    Returns:
    - data: A dictionary containing the parsed JSON data.
@@ -108,7 +108,7 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
    The function attempts to fix broken or incomplete JSON messages by parsing until the last valid code suggestion.
    If the JSON message ends with a closing bracket, the function calls the fix_json_escape_char function to fix the
    message.
-    If code_suggestions is True and the JSON message contains code suggestions, the function tries to fix the JSON
+    If code_suggestions is True and the JSON message contains code feedback, the function tries to fix the JSON
    message by parsing until the last valid code suggestion.
    The function uses regular expressions to find the last occurrence of "}," with any number of whitespaces or
    newlines.
@@ -128,7 +128,8 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
    else:
        closing_bracket = "]}}"

-    if review.rfind("'Code suggestions': [") > 0 or review.rfind('"Code suggestions": [') > 0:
+    if (review.rfind("'Code feedback': [") > 0 or review.rfind('"Code feedback": [') > 0) or \
+            (review.rfind("'Code suggestions': [") > 0 or review.rfind('"Code suggestions": [') > 0) :
        last_code_suggestion_ind = [m.end() for m in re.finditer(r"\}\s*,", review)][-1] - 1
        valid_json = False
        iter_count = 0