Merge pull request #352 from Codium-ai/tr/prompts

Refactoring Patch Extra Lines and Updating Prompts
2025-07-21 04:50:39 +08:00 · 2023-10-05 08:55:14 +03:00
parent d13a92515b f3e794e50b
commit d92f5284df
9 changed files with 51 additions and 15 deletions
--- a/Usage.md
+++ b/Usage.md
@@ -261,6 +261,30 @@ All PR-Agent tools have a parameter called `extra_instructions`, that enables to
 /update_changelog --pr_update_changelog.extra_instructions="Make sure to update also the version ..."
 ```

+#### Patch Extra Lines
+By default, around any change in your PR, git patch provides 3 lines of context above and below the change.
+```
+@@ -12,5 +12,5 @@ def func1():
+ code line that already existed in the file...
+ code line that already existed in the file...
+ code line that already existed in the file....
+-code line that was removed in the PR
++new code line added in the PR
+ code line that already existed in the file...
+ code line that already existed in the file...
+ code line that already existed in the file...
+```
+
+For the `review`, `describe`, `ask` and `add_docs` tools, if the token budget allows, PR-Agent tries to increase the number of lines of context, via the parameter:
+```
+[config]
+patch_extra_lines=3
+```
+
+Increasing this number provides more context to the model, but will also increase the number of tokens consumed.
+If the PR is too large (see [PR Compression strategy](./PR_COMPRESSION.md)), PR-Agent automatically sets this number to 0, using the original git patch.
+
+
 #### Azure DevOps provider
 To use Azure DevOps provider use the following settings in configuration.toml:
 ```
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -21,7 +21,6 @@ MORE_MODIFIED_FILES_ = "More modified files:\n"

 OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000
 OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
-PATCH_EXTRA_LINES = 3

 def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str,
                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str:
@@ -44,8 +43,9 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
    """

    if disable_extra_lines:
-        global PATCH_EXTRA_LINES
        PATCH_EXTRA_LINES = 0
+    else:
+        PATCH_EXTRA_LINES = get_settings().config.patch_extra_lines

    try:
        diff_files = git_provider.get_diff_files()
@@ -57,8 +57,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
    pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)

    # generate a standard diff string, with patch extension
-    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(pr_languages, token_handler,
-                                                               add_line_numbers_to_hunks)
+    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
+        pr_languages, token_handler, add_line_numbers_to_hunks, patch_extra_lines=PATCH_EXTRA_LINES)

    # if we are under the limit, return the full diff
    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < MAX_TOKENS[model]:
@@ -80,7 +80,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s

 def pr_generate_extended_diff(pr_languages: list,
                              token_handler: TokenHandler,
-                              add_line_numbers_to_hunks: bool) -> Tuple[list, int, list]:
+                              add_line_numbers_to_hunks: bool,
+                              patch_extra_lines: int = 0) -> Tuple[list, int, list]:
    """
    Generate a standard diff string with patch extension, while counting the number of tokens used and applying diff
    minimization techniques if needed.
@@ -102,7 +103,7 @@ def pr_generate_extended_diff(pr_languages: list,
                continue

            # extend each patch with extra lines of context
-            extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
+            extended_patch = extend_patch(original_file_content_str, patch, num_lines=patch_extra_lines)
            full_extended_patch = f"\n\n## {file.filename}\n\n{extended_patch}\n"

            if add_line_numbers_to_hunks:
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -10,6 +10,7 @@ use_repo_settings_file=true
 ai_timeout=180
 max_description_tokens = 500
 max_commits_tokens = 500
+patch_extra_lines = 3
 secret_provider="google_cloud_storage"
 cli_mode=false

--- a/pr_agent/settings/pr_add_docs.toml
+++ b/pr_agent/settings/pr_add_docs.toml
@@ -42,7 +42,9 @@ Specific instructions:
 {%- if extra_instructions %}

 Extra instructions from the user:
+'
 {{ extra_instructions }}
+'
 {%- endif %}

 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -1,6 +1,6 @@
 [pr_code_suggestions_prompt]
 system="""You are a language model called PR-Code-Reviewer, that specializes in suggesting code improvements for Pull Request (PR).
-Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR.
+Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR (the '+' lines in the diff).

 Example for a PR Diff input:
 '
@@ -31,14 +31,13 @@ __old hunk__
 '

 Specific instructions:
-- Provide up to {{ num_code_suggestions }} code suggestions.
+- Provide up to {{ num_code_suggestions }} code suggestions. Try to provide diverse and insightful suggestions.
 - Prioritize suggestions that address major problems, issues and bugs in the code.
  As a second priority, suggestions should focus on best practices, code readability, maintainability, enhancments, performance, and other aspects.
-  Don't suggest to add docstring, type hints, or comments.
-  Try to provide diverse and insightful suggestions.
+- Don't suggest to add docstring, type hints, or comments.
 - Suggestions should refer only to code from the '__new hunk__' sections, and focus on new lines of code (lines starting with '+').
-  Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code.
-  For each suggestion, make sure to take into consideration also the context, meaning the lines before and after the relevant code.
+- Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code.
+- For each suggestion, make sure to take into consideration also the context, meaning the lines before and after the relevant code.
 - Provide the exact line numbers range (inclusive) for each issue.
 - Assume there is additional relevant code, that is not included in the diff.

@@ -46,7 +45,9 @@ Specific instructions:
 {%- if extra_instructions %}

 Extra instructions from the user:
+'
 {{ extra_instructions }}
+'
 {%- endif %}

 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@@ -7,7 +7,9 @@ Your task is to provide full description of the PR content.
 {%- if extra_instructions %}

 Extra instructions from the user:
+'
 {{ extra_instructions }}
+'
 {% endif %}

 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -35,7 +35,9 @@ The review should focus on new code added in the PR (lines starting with '+'), a
 {%- if extra_instructions %}

 Extra instructions from the user:
+'
 {{ extra_instructions }}
+'
 {% endif %}

 You must use the following YAML schema to format your answer:
@@ -129,8 +131,7 @@ PR Feedback:
  Security concerns:
    type: string
    description: >-
-      yes\\no question: does this PR code introduce possible security concerns or
-      issues, like SQL injection, XSS, CSRF, and others ? If answered 'yes',explain your answer shortly
+      yes\\no question: does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? If answered 'yes', explain your answer briefly.
 {%- endif %}
 ```

@@ -196,7 +197,9 @@ Here are questions to better understand the PR. Use the answers to provide bette
 {{question_str|trim}}

 User answers:
+'
 {{answer_str|trim}}
+'
 ######
 {%- endif %}

--- a/pr_agent/settings/pr_update_changelog_prompts.toml
+++ b/pr_agent/settings/pr_update_changelog_prompts.toml
@@ -8,7 +8,9 @@ Your task is to update the CHANGELOG.md file of the project, to shortly summariz
 {%- if extra_instructions %}

 Extra instructions from the user:
+'
 {{ extra_instructions }}
+'
 {%- endif %}
 """

--- a/pr_agent/tools/pr_add_docs.py
+++ b/pr_agent/tools/pr_add_docs.py
@@ -68,7 +68,7 @@ class PRAddDocs:
                                        self.token_handler,
                                        model,
                                        add_line_numbers_to_hunks=True,
-                                        disable_extra_lines=True)
+                                        disable_extra_lines=False)

        logging.info('Getting AI prediction...')
        self.prediction = await self._get_prediction(model)