Merge pull request #630 from Codium-ai/tr/language

Enhancements in Patch Formatting and Code Suggestions Handling
2025-07-21 04:50:39 +08:00 · 2024-01-29 12:11:23 -08:00
parent 6998089549 15c8fe94bb
commit c69962479a
7 changed files with 37 additions and 29 deletions
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@ -181,7 +181,7 @@ __old hunk__
           ...
    """
    
-    patch_with_lines_str = f"\n\n## {file.filename}\n"
+    patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
    patch_lines = patch.splitlines()
    RE_HUNK_HEADER = re.compile(
        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@ -202,11 +202,11 @@ __old hunk__
                if new_content_lines:
                    if prev_header_line:
                        patch_with_lines_str += f'\n{prev_header_line}\n'
-                    patch_with_lines_str += '__new hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
                    for i, line_new in enumerate(new_content_lines):
                        patch_with_lines_str += f"{start2 + i} {line_new}\n"
                if old_content_lines:
-                    patch_with_lines_str += '__old hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
                    for line_old in old_content_lines:
                        patch_with_lines_str += f"{line_old}\n"
                new_content_lines = []
@ -236,11 +236,11 @@ __old hunk__
    if match and new_content_lines:
        if new_content_lines:
            patch_with_lines_str += f'\n{header_line}\n'
-            patch_with_lines_str += '\n__new hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
            for i, line_new in enumerate(new_content_lines):
                patch_with_lines_str += f"{start2 + i} {line_new}\n"
        if old_content_lines:
-            patch_with_lines_str += '\n__old hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
            for line_old in old_content_lines:
                patch_with_lines_str += f"{line_old}\n"

--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -209,9 +209,9 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo

        if patch:
            if not convert_hunks_to_line_numbers:
-                patch_final = f"## {file.filename}\n\n{patch}\n"
+                patch_final = f"\n\n## file: '{file.filename.strip()}'\n\n{patch.strip()}\n"
            else:
-                patch_final = patch
+                patch_final = "\n\n" + patch.strip()
            patches.append(patch_final)
            total_tokens += token_handler.count_tokens(patch_final)
            if get_settings().config.verbosity_level >= 2:
@ -375,6 +375,13 @@ def get_pr_multi_diffs(git_provider: GitProvider,
    for lang in pr_languages:
        sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))

+
+    # First, try a single run using the standard diff string, with patch extension and no deletions
+    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
+        pr_languages, token_handler, add_line_numbers_to_hunks=True)
+    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model):
+        return ["\n".join(patches_extended)]
+
    patches = []
    final_diff_list = []
    total_tokens = token_handler.prompt_tokens
--- a/pr_agent/settings/pr_add_docs.toml
+++ b/pr_agent/settings/pr_add_docs.toml
@ -5,7 +5,7 @@ Your task is to generate {{ docs_for_language }} for code components in the PR D

 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'

@@ -12,3 +12,4 @@ def func1():
 __new hunk__
@ -18,7 +18,6 @@ __old hunk__
 -code line that was removed in the PR
 code line2 that remained unchanged in the PR

-
@@ ... @@ def func2():
 __new hunk__
 ...
@ -26,7 +25,7 @@ __old hunk__
 ...


-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======

--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@ -4,7 +4,7 @@ Your task is to provide meaningful and actionable code suggestions, to improve t

 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'

@@ ... @@ def func1():
 __new hunk__
@ -16,7 +16,6 @@ __old hunk__
 -old code line2 that was removed in the PR
 code line3 that remained unchanged in the PR

-
@@ ... @@ def func2():
 __new hunk__
 ...
@ -24,7 +23,7 @@ __old hunk__
 ...


-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======

@ -51,6 +50,7 @@ The output must be a YAML object equivalent to type $PRCodeSuggestions, accordin
 =====
 class CodeSuggestion(BaseModel):
    relevant_file: str = Field(description="the relevant file full path")
+    language: str = Field(description="the code language of the relevant file")
    suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
 {%- if summarize_mode %}
    existing_code: str = Field(description="a short code snippet from a '__new hunk__' section to illustrate the relevant existing code. Don't show the line numbers.")
@ -74,6 +74,8 @@ Example output:
 code_suggestions:
 - relevant_file: |-
    src/file1.py
+  language: |-
+    python
  suggestion_content: |-
    Add a docstring to func1()
 {%- if summarize_mode %}
@ -105,11 +107,6 @@ user="""PR Info:

 Title: '{{title}}'

-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
-

 The PR Diff:
 ======
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@ -39,6 +39,7 @@ class PRType(str, Enum):

 Class FileDescription(BaseModel):
    filename: str = Field(description="the relevant file full path")
+    language: str = Field(description="the relevant file language")
    changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
    changes_title: str = Field(description="an informative title for the changes in the files, describing its main theme (5-10 words).")
    label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
@ -67,6 +68,8 @@ type:
 pr_files:
 - filename: |
    ...
+  language: |
+    ...
  changes_summary: |
    ...
  changes_title: |
@ -104,10 +107,7 @@ Previous description:
 {%- endif %}

 Branch: '{{branch}}'
-{%- if language %}

-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}

 Commit messages:
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@ -5,7 +5,7 @@ The review should focus on new code added in the PR diff (lines starting with '+

 Example PR Diff:
 ======
-## src/file1.py
+## file: 'src/file1.py'

@@ -12,5 +12,5 @@ def func1():
 code line 1 that remained unchanged in the PR
@ -14,12 +14,11 @@ code line 2 that remained unchanged in the PR
 +code line added in the PR
 code line 3 that remained unchanged in the PR

-
@@ ... @@ def func2():
 ...


-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======

@ -115,6 +114,9 @@ PR Feedback:
      relevant file:
        type: string
        description: the relevant file full path
+      language:
+        type: string
+        description: the language of the relevant file
      suggestion:
        type: string
        description: |-
@ -166,6 +168,8 @@ PR Feedback:
  Code feedback:
    - relevant file: |-
        directory/xxx.py
+      language: |-
+        python
      suggestion: |-
        xxx [important]
      relevant line: |-
@ -195,10 +199,6 @@ Description:
 ======
 {%- endif %}

-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}

 Commit messages:
--- a/pr_agent/tools/pr_code_suggestions.py
+++ b/pr_agent/tools/pr_code_suggestions.py
@ -226,7 +226,7 @@ class PRCodeSuggestions:
        for i, patches_diff in enumerate(patches_diff_list):
            get_logger().info(f"Processing chunk {i + 1} of {len(patches_diff_list)}")
            self.patches_diff = patches_diff
-            prediction = await self._get_prediction(model)
+            prediction = await self._get_prediction(model)  # TODO: parallelize
            prediction_list.append(prediction)
        self.prediction_list = prediction_list

@ -253,10 +253,15 @@ class PRCodeSuggestions:
        """

        suggestion_list = []
+        if not data:
+            return suggestion_list
        for suggestion in data:
            suggestion_list.append(suggestion)
        data_sorted = [[]] * len(suggestion_list)

+        if len(suggestion_list ) == 1:
+            return suggestion_list
+
        try:
            suggestion_str = ""
            for i, suggestion in enumerate(suggestion_list):