full code suggestions

2025-07-21 04:50:39 +08:00 · 2023-07-15 09:30:50 +03:00
parent 23a249ccdb
commit 4f4989af8c
11 changed files with 382 additions and 13 deletions
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@ -108,3 +108,78 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
                logging.info(f"Processing file: {file_name}, hunks were deleted")
            patch = patch_new
    return patch
+
+
+def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
+    """
+    Convert a unified-diff patch into per-hunk views with new-file line numbers.
+
+    For every hunk of `patch` the result contains:
+      * a `--new hunk--` section holding the new-side lines (context and '+'
+        lines), each prefixed with its line number in the new file, and
+      * an `--old hunk--` section holding the old-side lines (context and '-'
+        lines), without line numbers.
+    A section is omitted when it has no lines (e.g. no `--new hunk--` for a
+    hunk that only deletes lines).
+
+    Example output:
+
+        ## src/file.ts
+
+        --new hunk--
+        881        line1
+        882        line2
+        883 +      line3
+        884        line4
+        --old hunk--
+                line1
+                line2
+        -       line2b
+                line4
+
+    Args:
+        patch: Unified diff text of a single file. `None` or an empty string
+            yields only the file title.
+        file: Object exposing a `filename` attribute, used for the `## ...` title.
+
+    Returns:
+        The formatted text, stripped of leading/trailing whitespace.
+    """
+    # TODO: (maybe remove '-' and '+' from the beginning of the line)
+    import re  # kept function-local, exactly like the original implementation
+
+    hunk_header_re = re.compile(
+        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
+
+    rendered = [f"## {file.filename}\n"]
+    new_content_lines = []
+    old_content_lines = []
+    new_start = -1
+    in_hunk = False  # True only while collecting lines under a valid hunk header
+
+    for line in (patch or "").splitlines():
+        # Diff meta lines such as "\ No newline at end of file" always start with
+        # a backslash. Matching on the prefix (rather than on the text) keeps
+        # real source lines that merely contain that phrase, so numbering stays exact.
+        if line.startswith('\\'):
+            continue
+
+        if line.startswith('@@'):
+            # Any header closes the hunk collected so far.
+            if in_hunk:
+                rendered.append(_render_hunk(new_start, new_content_lines, old_content_lines))
+            new_content_lines, old_content_lines = [], []
+            match = hunk_header_re.match(line)
+            # After an unparsable header the new-file position is unknown, so the
+            # following lines are skipped until the next valid header.
+            in_hunk = match is not None
+            if match:
+                # Only the new-side start is needed; the counts are optional in
+                # the header ("@@ -3 +3 @@") and must not be converted blindly.
+                new_start = int(match.group(3))
+        elif not in_hunk:
+            # File headers ("--- a/x", "+++ b/x") or other text before the first hunk.
+            continue
+        elif line.startswith('+'):
+            new_content_lines.append(line)
+        elif line.startswith('-'):
+            old_content_lines.append(line)
+        else:
+            # Context lines belong to both versions of the file.
+            new_content_lines.append(line)
+            old_content_lines.append(line)
+
+    # finishing last hunk
+    if in_hunk:
+        rendered.append(_render_hunk(new_start, new_content_lines, old_content_lines))
+
+    return "".join(rendered).strip()
+
+
+def _render_hunk(new_start: int, new_lines: list, old_lines: list) -> str:
+    """Format one hunk as a numbered `--new hunk--` part and a plain `--old hunk--` part."""
+    if not new_lines and not old_lines:
+        return ""
+    parts = ['\n']  # blank line separating this hunk from the preceding text
+    if new_lines:
+        parts.append('--new hunk--\n')
+        parts.extend(f"{new_start + i} {line_new}\n" for i, line_new in enumerate(new_lines))
+    if old_lines:
+        parts.append('--old hunk--\n')
+        parts.extend(f"{line_old}\n" for line_old in old_lines)
+    return "".join(parts)
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -4,7 +4,8 @@ import difflib
 import logging
 from typing import Any, Tuple, Union

-from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions
+from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions, \
+    convert_to_hunks_with_lines_numbers
 from pr_agent.algo.language_handler import sort_files_by_main_languages
 from pr_agent.algo.token_handler import TokenHandler
 from pr_agent.config_loader import settings
@ -19,26 +20,33 @@ OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
 PATCH_EXTRA_LINES = 3


-def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str:
+def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
+                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool =False) -> str:
    """
    Returns a string with the diff of the PR.
    If needed, apply diff minimization techniques to reduce the number of tokens
    """
+    if disable_extra_lines:
+        global PATCH_EXTRA_LINES
+        PATCH_EXTRA_LINES = 0
+
    git_provider.pr.diff_files = list(git_provider.get_diff_files())

    # get pr languages
    pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files)

    # generate a standard diff string, with patch extension
-    patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler)
+    patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler,
+                                                               add_line_numbers_to_hunks)

    # if we are under the limit, return the full diff
    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit:
        return "\n".join(patches_extended)

    # if we are over the limit, start pruning
-    patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages,
-                                                                                              token_handler)
+    patches_compressed, modified_file_names, deleted_file_names = \
+        pr_generate_compressed_diff(pr_languages, token_handler, add_line_numbers_to_hunks)
+
    final_diff = "\n".join(patches_compressed)
    if modified_file_names:
        modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
@ -49,7 +57,8 @@ def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHa
    return final_diff


-def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \
+def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
+                              add_line_numbers_to_hunks: bool) -> \
        Tuple[list, int]:
    """
    Generate a standard diff string, with patch extension
@ -72,6 +81,9 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
            extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
            full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n"

+            if add_line_numbers_to_hunks:
+                full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)
+
            patch_tokens = token_handler.count_tokens(full_extended_patch)
            file.tokens = patch_tokens
            total_tokens += patch_tokens
@ -80,7 +92,8 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
    return patches_extended, total_tokens


-def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]:
+def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler,
+                                convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]:
    # Apply Diff Minimization techniques to reduce the number of tokens:
    # 0. Start from the largest diff patch to smaller ones
    # 1. Don't use extend context lines around diff
@ -114,6 +127,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
            deleted_files_list.append(file.filename)
            total_tokens += token_handler.count_tokens(file.filename) + 1
            continue
+
+        if convert_hunks_to_line_numbers:
+            patch = convert_to_hunks_with_lines_numbers(patch, file)
+
        new_patch_tokens = token_handler.count_tokens(patch)

        # Hard Stop, no more tokens
@ -135,7 +152,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
            continue

        if patch:
-            patch_final = f"## {file.filename}\n\n{patch}\n"
+            if not convert_hunks_to_line_numbers:
+                patch_final = f"## {file.filename}\n\n{patch}\n"
+            else:
+                patch_final = patch
            patches.append(patch_final)
            total_tokens += token_handler.count_tokens(patch_final)
            if settings.config.verbosity_level >= 2: