Merge pull request #352 from Codium-ai/tr/prompts

Refactoring Patch Extra Lines and Updating Prompts
2025-07-21 04:50:39 +08:00 · 2023-10-05 08:55:14 +03:00
parent d13a92515b f3e794e50b
commit d92f5284df
9 changed files with 51 additions and 15 deletions
--- a/Usage.md
+++ b/Usage.md
@ -261,6 +261,30 @@ All PR-Agent tools have a parameter called `extra_instructions`, that enables to
 /update_changelog --pr_update_changelog.extra_instructions="Make sure to update also the version ..."
 ```
 #### Patch Extra Lines
 By default, around any change in your PR, git patch provides 3 lines of context above and below the change.
 ```
@@ -12,5 +12,5 @@ def func1():
 code line that already existed in the file...
 code line that already existed in the file...
 code line that already existed in the file...
 -code line that was removed in the PR
 +new code line added in the PR
 code line that already existed in the file...
 code line that already existed in the file...
 code line that already existed in the file...
 ```
 For the `review`, `describe`, `ask` and `add_docs` tools, if the token budget allows, PR-Agent tries to increase the number of lines of context, via the parameter:
 ```
 [config]
 patch_extra_lines=3
 ```
 Increasing this number provides more context to the model, but will also increase the number of tokens consumed.
 If the PR is too large (see [PR Compression strategy](./PR_COMPRESSION.md)), PR-Agent automatically sets this number to 0, using the original git patch.
 #### Azure DevOps provider
 To use Azure DevOps provider use the following settings in configuration.toml:
 ```
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -21,7 +21,6 @@ MORE_MODIFIED_FILES_ = "More modified files:\n"
 OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000
 OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
 PATCH_EXTRA_LINES = 3
 def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str,
                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str:
@ -44,8 +43,9 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
    """
    if disable_extra_lines:
        global PATCH_EXTRA_LINES
        PATCH_EXTRA_LINES = 0
    else:
        PATCH_EXTRA_LINES = get_settings().config.patch_extra_lines
    try:
        diff_files = git_provider.get_diff_files()
@ -57,8 +57,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
    pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
    # generate a standard diff string, with patch extension
-    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(pr_languages, token_handler,
+    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
-                                                               add_line_numbers_to_hunks)
+        pr_languages, token_handler, add_line_numbers_to_hunks, patch_extra_lines=PATCH_EXTRA_LINES)
    # if we are under the limit, return the full diff
    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < MAX_TOKENS[model]:
@ -80,7 +80,8 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
 def pr_generate_extended_diff(pr_languages: list,
                              token_handler: TokenHandler,
-                              add_line_numbers_to_hunks: bool) -> Tuple[list, int, list]:
+                              add_line_numbers_to_hunks: bool,
                              patch_extra_lines: int = 0) -> Tuple[list, int, list]:
    """
    Generate a standard diff string with patch extension, while counting the number of tokens used and applying diff
    minimization techniques if needed.
@ -102,7 +103,7 @@ def pr_generate_extended_diff(pr_languages: list,
                continue
            # extend each patch with extra lines of context
-            extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
+            extended_patch = extend_patch(original_file_content_str, patch, num_lines=patch_extra_lines)
            full_extended_patch = f"\n\n## {file.filename}\n\n{extended_patch}\n"
            if add_line_numbers_to_hunks:
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -10,6 +10,7 @@ use_repo_settings_file=true
 ai_timeout=180
 max_description_tokens = 500
 max_commits_tokens = 500
 patch_extra_lines = 3
 secret_provider="google_cloud_storage"
 cli_mode=false
--- a/pr_agent/settings/pr_add_docs.toml
+++ b/pr_agent/settings/pr_add_docs.toml
@ -42,7 +42,9 @@ Specific instructions:
 {%- if extra_instructions %}
 Extra instructions from the user:
 '
 {{ extra_instructions }}
 '
 {%- endif %}
 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@ -1,6 +1,6 @@
 [pr_code_suggestions_prompt]
 system="""You are a language model called PR-Code-Reviewer, that specializes in suggesting code improvements for Pull Request (PR).
-Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR.
+Your task is to provide meaningful and actionable code suggestions, to improve the new code presented in a PR (the '+' lines in the diff).
 Example for a PR Diff input:
 '
@ -31,14 +31,13 @@ __old hunk__
 '
 Specific instructions:
- Provide up to {{ num_code_suggestions }} code suggestions.
+- Provide up to {{ num_code_suggestions }} code suggestions. Try to provide diverse and insightful suggestions.
 - Prioritize suggestions that address major problems, issues and bugs in the code.
  As a second priority, suggestions should focus on best practices, code readability, maintainability, enhancements, performance, and other aspects.
-  Don't suggest to add docstring, type hints, or comments.
+- Don't suggest to add docstring, type hints, or comments.
  Try to provide diverse and insightful suggestions.
 - Suggestions should refer only to code from the '__new hunk__' sections, and focus on new lines of code (lines starting with '+').
-  Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code.
+- Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code.
-  For each suggestion, make sure to take into consideration also the context, meaning the lines before and after the relevant code.
+- For each suggestion, make sure to take into consideration also the context, meaning the lines before and after the relevant code.
 - Provide the exact line numbers range (inclusive) for each issue.
 - Assume there is additional relevant code, that is not included in the diff.
@ -46,7 +45,9 @@ Specific instructions:
 {%- if extra_instructions %}
 Extra instructions from the user:
 '
 {{ extra_instructions }}
 '
 {%- endif %}
 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@ -7,7 +7,9 @@ Your task is to provide full description of the PR content.
 {%- if extra_instructions %}
 Extra instructions from the user:
 '
 {{ extra_instructions }}
 '
 {% endif %}
 You must use the following YAML schema to format your answer:
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@ -35,7 +35,9 @@ The review should focus on new code added in the PR (lines starting with '+'), a
 {%- if extra_instructions %}
 Extra instructions from the user:
 '
 {{ extra_instructions }}
 '
 {% endif %}
 You must use the following YAML schema to format your answer:
@ -129,8 +131,7 @@ PR Feedback:
  Security concerns:
    type: string
    description: >-
-      yes\\no question: does this PR code introduce possible security concerns or
+      yes\\no question: does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others? If answered 'yes', explain your answer briefly.
      issues, like SQL injection, XSS, CSRF, and others ? If answered 'yes',explain your answer shortly
 {%- endif %}
 ```
@ -196,7 +197,9 @@ Here are questions to better understand the PR. Use the answers to provide bette
 {{question_str|trim}}
 User answers:
 '
 {{answer_str|trim}}
 '
 ######
 {%- endif %}
--- a/pr_agent/settings/pr_update_changelog_prompts.toml
+++ b/pr_agent/settings/pr_update_changelog_prompts.toml
@ -8,7 +8,9 @@ Your task is to update the CHANGELOG.md file of the project, to shortly summariz
 {%- if extra_instructions %}
 Extra instructions from the user:
 '
 {{ extra_instructions }}
 '
 {%- endif %}
 """
--- a/pr_agent/tools/pr_add_docs.py
+++ b/pr_agent/tools/pr_add_docs.py
@ -68,7 +68,7 @@ class PRAddDocs:
                                        self.token_handler,
                                        model,
                                        add_line_numbers_to_hunks=True,
-                                        disable_extra_lines=True)
+                                        disable_extra_lines=False)
        logging.info('Getting AI prediction...')
        self.prediction = await self._get_prediction(model)