Merge pull request #1159 from Codium-ai/tr/patches

Tr/patches
2025-07-02 11:50:37 +08:00 · 2024-08-20 11:41:05 +03:00
parent 9bec97c66c d467f5a7fd
commit 772499fce1
5 changed files with 57 additions and 41 deletions
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@ -7,7 +7,8 @@ from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
 from pr_agent.log import get_logger


-def extend_patch(original_file_str, patch_str, patch_extra_lines_before=0, patch_extra_lines_after=0) -> str:
+def extend_patch(original_file_str, patch_str, patch_extra_lines_before=0,
+                 patch_extra_lines_after=0, filename: str = "") -> str:
    if not patch_str or (patch_extra_lines_before == 0 and patch_extra_lines_after == 0) or not original_file_str:
        return patch_str

@ -17,6 +18,13 @@ def extend_patch(original_file_str, patch_str, patch_extra_lines_before=0, patch
        except UnicodeDecodeError:
            return ""

+    # skip patches
+    patch_extension_skip_types = get_settings().config.patch_extension_skip_types #[".md",".txt"]
+    if patch_extension_skip_types and filename:
+        if any([filename.endswith(skip_type) for skip_type in patch_extension_skip_types]):
+            return patch_str
+
+    # dynamic context settings
    allow_dynamic_context = get_settings().config.allow_dynamic_context
    max_extra_lines_before_dynamic_context = get_settings().config.max_extra_lines_before_dynamic_context
    patch_extra_lines_before_dynamic = patch_extra_lines_before
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -191,7 +191,7 @@ def pr_generate_extended_diff(pr_languages: list,

            # extend each patch with extra lines of context
            extended_patch = extend_patch(original_file_content_str, patch,
-                                          patch_extra_lines_before, patch_extra_lines_after)
+                                          patch_extra_lines_before, patch_extra_lines_after, file.filename)
            if not extended_patch:
                get_logger().warning(f"Failed to extend patch for file: {file.filename}")
                continue
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -2,7 +2,7 @@
 # models
 model="gpt-4-turbo-2024-04-09"
 model_turbo="gpt-4o-2024-08-06"
-fallback_models=["gpt-4-0125-preview"]
+fallback_models=["gpt-4o-2024-05-13"]
 # CLI
 git_provider="github"
 publish_output=true
@ -19,7 +19,8 @@ max_description_tokens = 500
 max_commits_tokens = 500
 max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities.
 custom_model_max_tokens=-1 # for models not in the default list
-#
+# patch extension logic
+patch_extension_skip_types =[".md",".txt"]
 allow_dynamic_context=false
 max_extra_lines_before_dynamic_context = 10 # will try to include up to 10 extra lines before the hunk in the patch, until we reach an enclosing function or class
 patch_extra_lines_before = 3 # Number of extra lines (+3 default ones) to include before each hunk in the patch
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@ -1,6 +1,6 @@
 [pr_code_suggestions_prompt]
-system="""You are PR-Reviewer, a language model that specializes in suggesting ways to improve for a Pull Request (PR) code.
-Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR diff.
+system="""You are PR-Reviewer, a language model that specializes in suggesting improvements to a Pull Request (PR) code.
+Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR code diff (lines starting with '+').


 The format we will use to present the PR code diff:
@ -9,13 +9,15 @@ The format we will use to present the PR code diff:

@@ ... @@ def func1():
 __new hunk__
-12  code line1 that remained unchanged in the PR
+11  unchanged code line0 in the PR
+12  unchanged code line1 in the PR
 13 +new code line2 added in the PR
-14  code line3 that remained unchanged in the PR
+14  unchanged code line3 in the PR
 __old hunk__
- code line1 that remained unchanged in the PR
-old code line2 that was removed in the PR
- code line3 that remained unchanged in the PR
+ unchanged code line0
+ unchanged code line1
+-old code line2 removed in the PR
+ unchanged code line3

@@ ... @@ def func2():
 __new hunk__
@ -27,15 +29,15 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
+
+- In this format, we separate each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
+- We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and should only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
-Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.


 Specific instructions for generating code suggestions:
 - Provide in total up to {{ num_code_suggestions }} code suggestions. The suggestions should be diverse and insightful.
- The suggestions should focus on improving the new code introduced the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers).
+- The suggestions should focus on improving only the new code introduced the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers). The '__old hunk__' sections code is for context and reference only.
 - Prioritize suggestions that address possible issues, major problems, and bugs in the PR code.
 - Don't suggest to add docstring, type hints, or comments, or to remove unused imports.
 - Suggestions should not repeat code already present in the '__new hunk__' sections.
@ -97,7 +99,7 @@ code_suggestions:
 Each YAML output MUST be after a newline, indented, with block scalar indicator ('|').
 """

-user="""PR Info:
+user="""--PR Info--

 Title: '{{title}}'

@ -114,8 +116,8 @@ Response (should be a valid YAML, and nothing else):


 [pr_code_suggestions_prompt_claude]
-system="""You are PR-Reviewer, a language model that specializes in suggesting ways to improve for a Pull Request (PR) code.
-Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR diff.
+system="""You are PR-Reviewer, a language model that specializes in suggesting improvements to a Pull Request (PR) code.
+Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR code diff (lines starting with '+').


 The format we will use to present the PR code diff:
@ -124,13 +126,15 @@ The format we will use to present the PR code diff:

@@ ... @@ def func1():
 __new hunk__
-12  code line1 that remained unchanged in the PR
+11  unchanged code line0 in the PR
+12  unchanged code line1 in the PR
 13 +new code line2 added in the PR
-14  code line3 that remained unchanged in the PR
+14  unchanged code line3 in the PR
 __old hunk__
- code line1 that remained unchanged in the PR
-old code line2 that was removed in the PR
- code line3 that remained unchanged in the PR
+ unchanged code line0
+ unchanged code line1
+-old code line2 removed in the PR
+ unchanged code line3

@@ ... @@ def func2():
 __new hunk__
@ -142,15 +146,15 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
+
+- In this format, we separate each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
+- We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and should only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
-Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.


 Specific instructions for generating code suggestions:
 - Provide in total up to {{ num_code_suggestions }} code suggestions. The suggestions should be diverse and insightful.
- The suggestions should focus on improving the new code introduced the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers).
+- The suggestions should focus on improving only the new code introduced the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers).
 - Prioritize suggestions that address possible issues, major problems, and bugs in the PR code.
 - Don't suggest to add docstring, type hints, or comments, or to remove unused imports.
 - Provide the exact line numbers range (inclusive) for each suggestion. Use the line numbers from the '__new hunk__' sections.
@ -173,7 +177,7 @@ The output must be a YAML object equivalent to type $PRCodeSuggestions, accordin
 class CodeSuggestion(BaseModel):
    relevant_file: str = Field(description="The full file path of the relevant file")
    language: str = Field(description="the programming language of the relevant file")
-    suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise ")
+    suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise")
    existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Quote only full code lines, not partial ones. Use abbreviations ("...") of full lines if needed")
    improved_code: str = Field(description="a new code snippet, that can be used to replace the relevant 'existing_code' lines in '__new hunk__' code after applying the suggestion")
    one_sentence_summary: str = Field(description="a short summary of the suggestion action, in a single sentence. Focus on the 'what'. Be general, and avoid method or variable names.")
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@ -5,7 +5,7 @@ Your task is to provide constructive and concise feedback for the PR, and also p
 {%- else %}
 Your task is to provide constructive and concise feedback for the PR.
 {%- endif %}
-The review should focus on new code added in the PR diff (lines starting with '+')
+The review should focus on new code added in the PR code diff (lines starting with '+')


 The format we will use to present the PR code diff:
@ -14,13 +14,15 @@ The format we will use to present the PR code diff:

@@ ... @@ def func1():
 __new hunk__
-12  code line1 that remained unchanged in the PR
+11  unchanged code line0 in the PR
+12  unchanged code line1 in the PR
 13 +new code line2 added in the PR
-14  code line3 that remained unchanged in the PR
+14  unchanged code line3 in the PR
 __old hunk__
- code line1 that remained unchanged in the PR
-old code line2 that was removed in the PR
- code line3 that remained unchanged in the PR
+ unchanged code line0
+ unchanged code line1
+-old code line2 removed in the PR
+ unchanged code line3

@@ ... @@ def func2():
 __new hunk__
@ -32,10 +34,11 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
+
 - In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
+- We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and should only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
-The review should focus on new code added in the PR diff (lines starting with '+')
+ The review should address new code added in the PR code diff (lines starting with '+')
 - When quoting variables or names from the code, use backticks (`) instead of single quote (').

 {%- if num_code_suggestions > 0 %}
@ -46,7 +49,7 @@ Code suggestions guidelines:
 - Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices.
 - Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the PR code.
 - Don't suggest to add docstring, type hints, or comments.
- Suggestions should focus on the new code added in the PR diff (lines starting with '+')
+- Suggestions should address the new code added in the PR diff (lines starting with '+')
 {%- endif %}

 {%- if extra_instructions %}
@ -98,7 +101,7 @@ class Review(BaseModel):
 {%- endif %}
    key_issues_to_review: List[KeyIssuesComponentLink] = Field("A list of bugs, issue or major performance concerns introduced in this PR, which the PR reviewer should further investigate")
 {%- if require_security_review %}
-    security_concerns: str = Field(description="does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? Answer 'No' if there are no possible issues. If there are security concerns or issues, start your answer with a short header, such as: 'Sensitive information exposure: ...', 'SQL injection: ...' etc. Explain your answer. Be specific and give examples if possible")
+    security_concerns: str = Field(description="Does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? Answer 'No' (without explaining why) if there are no possible issues. If there are security concerns or issues, start your answer with a short header, such as: 'Sensitive information exposure: ...', 'SQL injection: ...' etc. Explain your answer. Be specific and give examples if possible")
 {%- endif %}
 {%- if require_can_be_split_review %}
    can_be_split: List[SubPR] = Field(min_items=0, max_items=3, description="Can this PR, which contains {{ num_pr_files }} changed files in total, be divided into smaller sub-PRs with distinct tasks that can be reviewed and merged independently, regardless of the order ? Make sure that the sub-PRs are indeed independent, with no code dependencies between them, and that each sub-PR represent a meaningful independent task. Output an empty list if the PR code does not need to be split.")
@ -180,7 +183,7 @@ code_feedback:
 Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|')
 """

-user="""PR Info:
+user="""--PR Info--

 Title: '{{title}}'

@ -188,7 +191,7 @@ Branch: '{{branch}}'

 {%- if description %}

-Description:
+PR Description:
 ======
 {{ description|trim }}
 ======
@ -209,7 +212,7 @@ User answers:
 {%- endif %}


-The PR Diff:
+The PR code diff:
 ======
 {{ diff|trim }}
 ======