review with links

2025-07-21 04:50:39 +08:00 · 2024-07-14 08:53:53 +03:00
parent ce35d2c313
commit 5d6e1de157
4 changed files with 124 additions and 34 deletions
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@ -1,5 +1,6 @@
 from __future__ import annotations

+import copy
 import difflib
 import json
 import os
@ -44,7 +45,7 @@ def get_setting(key: str) -> Any:
        return global_settings.get(key, None)


-def emphasize_header(text: str, only_markdown=False) -> str:
+def emphasize_header(text: str, only_markdown=False, reference_link=None) -> str:
    try:
        # Finding the position of the first occurrence of ": "
        colon_position = text.find(": ")
@ -53,9 +54,15 @@ def emphasize_header(text: str, only_markdown=False) -> str:
        if colon_position != -1:
            # Everything before the colon (inclusive) is wrapped in <strong> tags
            if only_markdown:
-                transformed_string = f"**{text[:colon_position + 1]}**\n" + text[colon_position + 1:]
+                if reference_link:
+                    transformed_string = f"[**{text[:colon_position + 1]}**]({reference_link})\n" + text[colon_position + 1:]
+                else:
+                    transformed_string = f"**{text[:colon_position + 1]}**\n" + text[colon_position + 1:]
            else:
-                transformed_string = "<strong>" + text[:colon_position + 1] + "</strong>" +'<br>' + text[colon_position + 1:]
+                if reference_link:
+                    transformed_string = f"<strong><a href='{reference_link}'>{text[:colon_position + 1]}</a></strong><br>" + text[colon_position + 1:]
+                else:
+                    transformed_string = "<strong>" + text[:colon_position + 1] + "</strong>" +'<br>' + text[colon_position + 1:]
        else:
            # If there's no ": ", return the original string
            transformed_string = text
@ -77,7 +84,10 @@ def unique_strings(input_list: List[str]) -> List[str]:
            seen.add(item)
    return unique_list

-def convert_to_markdown_v2(output_data: dict, gfm_supported: bool = True, incremental_review=None) -> str:
+def convert_to_markdown_v2(output_data: dict,
+                           gfm_supported: bool = True,
+                           incremental_review=None,
+                           git_provider=None) -> str:
    """
    Convert a dictionary of data into markdown format.
    Args:
@ -113,12 +123,13 @@ def convert_to_markdown_v2(output_data: dict, gfm_supported: bool = True, increm

    for key, value in output_data['review'].items():
        if value is None or value == '' or value == {} or value == []:
-            if key.lower() != 'can_be_split':
+            if key.lower() != 'can_be_split' and key.lower() != 'key_issues_to_review':
                continue
        key_nice = key.replace('_', ' ').capitalize()
        emoji = emojis.get(key_nice, "")
        if 'Estimated effort to review' in key_nice:
            key_nice = 'Estimated effort to review'
+            value = str(value).strip()
            if value.isnumeric():
                value_int = int(value)
            else:
@ -179,7 +190,7 @@ def convert_to_markdown_v2(output_data: dict, gfm_supported: bool = True, increm
                markdown_text += process_can_be_split(emoji, value)
                markdown_text += f"</td></tr>\n"
        elif 'key issues to review' in key_nice.lower():
-            value = value.strip()
+            # value is a list of issues
            if is_value_no(value):
                if gfm_supported:
                    markdown_text += f"<tr><td>"
@ -188,20 +199,46 @@ def convert_to_markdown_v2(output_data: dict, gfm_supported: bool = True, increm
                else:
                    markdown_text += f"### {emoji} No key issues to review\n\n"
            else:
-                issues = value.split('\n- ')
-                for i, _ in enumerate(issues):
-                    issues[i] = issues[i].strip().strip('-').strip()
-                issues = unique_strings(issues) # remove duplicates
+                # issues = value.split('\n- ')
+                issues =value
+                # for i, _ in enumerate(issues):
+                #     issues[i] = issues[i].strip().strip('-').strip()
                if gfm_supported:
                    markdown_text += f"<tr><td>"
                    markdown_text += f"{emoji}&nbsp;<strong>{key_nice}</strong><br><br>\n\n"
                else:
-                    markdown_text += f"### {emoji} Key issues to review:\n\n"
+                    markdown_text += f"### {emoji} Key issues to review\n\n#### \n"
                for i, issue in enumerate(issues):
-                    if not issue:
-                        continue
-                    issue = emphasize_header(issue, only_markdown=True)
-                    markdown_text += f"{issue}\n\n"
+                    try:
+                        if not issue:
+                            continue
+                        relevant_file = issue.get('relevant_file', '').strip()
+                        issue_header = issue.get('issue_header', '').strip()
+                        issue_content = issue.get('issue_content', '').strip()
+                        start_line = int(str(issue.get('start_line', 0)).strip())
+                        end_line = int(str(issue.get('end_line', 0)).strip())
+                        reference_link = git_provider.get_line_link(relevant_file, start_line, end_line)
+
+                        if gfm_supported:
+                            if get_settings().pr_reviewer.extra_issue_links:
+                                issue_content_linked =copy.deepcopy(issue_content)
+                                referenced_variables_list = issue.get('referenced_variables', [])
+                                for component in referenced_variables_list:
+                                    name = component['variable_name'].strip().strip('`')
+
+                                    ind = issue_content.find(name)
+                                    if ind != -1:
+                                        reference_link_component = git_provider.get_line_link(relevant_file, component['relevant_line'], component['relevant_line'])
+                                        issue_content_linked = issue_content_linked[:ind-1] + f"[`{name}`]({reference_link_component})" + issue_content_linked[ind+len(name)+1:]
+                                    else:
+                                        get_logger().info(f"Failed to find variable in issue content: {component['variable_name'].strip()}")
+                                issue_content = issue_content_linked
+                            issue_str = f"<a href='{reference_link}'><strong>{issue_header}</strong></a><br>{issue_content}"
+                        else:
+                            issue_str = f"[**{issue_header}**]({reference_link})\n\n{issue_content}\n\n"
+                        markdown_text += f"{issue_str}\n\n"
+                    except Exception as e:
+                        get_logger().exception(f"Failed to process key issues to review: {e}")
                if gfm_supported:
                    markdown_text += f"</td></tr>\n"
        else:
@ -536,6 +573,7 @@ def load_yaml(response_text: str, keys_fix_yaml: List[str] = [], first_key="", l
    return data


+
 def try_fix_yaml(response_text: str,
                 keys_fix_yaml: List[str] = [],
                 first_key="",
@ -558,14 +596,14 @@ def try_fix_yaml(response_text: str,
    except:
        get_logger().info(f"Failed to parse AI prediction after adding |-\n")

-    # second fallback - try to extract only range from first ```yaml to ```
+    # second fallback - try to extract only range from first ```yaml to ````
    snippet_pattern = r'```(yaml)?[\s\S]*?```'
    snippet = re.search(snippet_pattern, '\n'.join(response_text_lines_copy))
    if snippet:
        snippet_text = snippet.group()
        try:
            data = yaml.safe_load(snippet_text.removeprefix('```yaml').rstrip('`'))
-            get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet with second fallback")
+            get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet")
            return data
        except:
            pass
@ -580,6 +618,7 @@ def try_fix_yaml(response_text: str,
    except:
        pass

+
    # forth fallback - try to extract yaml snippet by 'first_key' and 'last_key'
    # note that 'last_key' can be in practice a key that is not the last key in the yaml snippet.
    # it just needs to be some inner key, so we can look for newlines after it
@ -885,7 +924,7 @@ def show_relevant_configurations(relevant_section: str) -> str:
    return markdown_text

 def is_value_no(value):
-    if value is None:
+    if not value:
        return True
    value_str = str(value).strip().lower()
    if value_str == 'no' or value_str == 'none' or value_str == 'false':
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -30,6 +30,7 @@ require_tests_review=true
 require_estimate_effort_to_review=true
 require_can_be_split_review=false
 require_security_review=true
+extra_issue_links=false
 # soc2
 require_soc2_ticket=false
 soc2_ticket_prompt="Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?"
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@ -7,39 +7,51 @@ Your task is to provide constructive and concise feedback for the PR.
 {%- endif %}
 The review should focus on new code added in the PR diff (lines starting with '+')

-Example PR Diff:
+
+The format we will use to present the PR code diff:
 ======
 ## file: 'src/file1.py'

-@@ -12,5 +12,5 @@ def func1():
-code line 1 that remained unchanged in the PR
-code line 2 that remained unchanged in the PR
-code line that was removed in the PR
-+code line added in the PR
-code line 3 that remained unchanged in the PR
+@@ ... @@ def func1():
+__new hunk__
+12  code line1 that remained unchanged in the PR
+13 +new hunk code line2 added in the PR
+14  code line3 that remained unchanged in the PR
+__old hunk__
+ code line1 that remained unchanged in the PR
+-old hunk code line2 that was removed in the PR
+ code line3 that remained unchanged in the PR

@@ ... @@ def func2():
+__new hunk__
+...
+__old hunk__
 ...


 ## file: 'src/file2.py'
 ...
 ======
+- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
+- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
+- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
+The review should focus on new code added in the PR diff (lines starting with '+')
+- When quoting variables or names from the code, use backticks (`) instead of single quote (').

 {%- if num_code_suggestions > 0 %}


 Code suggestions guidelines:
 - Provide up to {{ num_code_suggestions }} code suggestions. Try to provide diverse and insightful suggestions.
- Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements like performance, vulnerability, modularity, and best practices.
+- Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices.
 - Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs, or change a variable to const, or anything else, make sure it isn't already in the PR code.
 - Don't suggest to add docstring, type hints, or comments.
 - Suggestions should focus on the new code added in the PR diff (lines starting with '+')
- When quoting variables or names from the code, use backticks (`) instead of single quote (').
 {%- endif %}

 {%- if extra_instructions %}

+
 Extra instructions from the user:
 ======
 {{ extra_instructions }}
@ -55,6 +67,22 @@ class SubPR(BaseModel):
    title: str = Field(description="Short and concise title for an independent and meaningful sub-PR, composed only from the relevant files")
 {%- endif %}

+class KeyIssuesComponentLink(BaseModel):
+    relevant_file: str = Field(description="The full file path of the relevant file")
+    issue_header: str = Field(description="one or two word title for the the issue. For example: 'Possible Bug', 'Performance Issue', 'Code Smell', etc.")
+    issue_content: str = Field(description="a short and concise description of the issue that needs to be reviewed")
+    start_line: int = Field(description="the start line that corresponds to this issue in the relevant file")
+    end_line: int = Field(description="the end line that corresponds to this issue in the relevant file")
+{%- if extra_issue_links %}
+    referenced_variables: List[Refs] = Field(description="a list of relevant variables or names that appear in the 'issue_content' output. For each variable, output is name, and the line number where it appears in the relevant file")
+{% endif %}
+
+{%- if extra_issue_links %}
+class Refs(BaseModel):
+    variable_name: str = Field(description="the name of a variable or name that appears in the relevant 'issue_content' output.")
+    relevant_line: int = Field(description="the line number where the variable or name appears in the relevant file")
+{%- endif %}
+
 class Review(BaseModel):
 {%- if require_estimate_effort_to_review %}
    estimated_effort_to_review_[1-5]: int = Field(description="Estimate, on a scale of 1-5 (inclusive), the time and effort required to review this PR by an experienced and knowledgeable developer. 1 means short and easy review , 5 means long and hard review. Take into account the size, complexity, quality, and the needed changes of the PR code diff.")
@ -68,7 +96,7 @@ class Review(BaseModel):
 {%- if question_str %}
    insights_from_user_answers: str = Field(description="shortly summarize the insights you gained from the user's answers to the questions")
 {%- endif %}
-    key_issues_to_review: str = Field(description="Does this PR code introduce issues, bugs, or major performance concerns, which the PR reviewer should further investigate ? If there are no apparent issues, respond with 'None'. If there are any issues, describe them briefly. Use bullet points if more than one issue. Be specific, and provide examples if possible. Start each bullet point with a short specific header, such as: "- Possible Bug: ...", etc.")
+    key_issues_to_review: List[KeyIssuesComponentLink] = Field("A list of bugs, issue or major performance concerns introduced in this PR, which the PR reviewer should further investigate")
 {%- if require_security_review %}
    security_concerns: str = Field(description="does this PR code introduce possible vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? Answer 'No' if there are no possible issues. If there are security concerns or issues, start your answer with a short header, such as: 'Sensitive information exposure: ...', 'SQL injection: ...' etc. Explain your answer. Be specific and give examples if possible")
 {%- endif %}
@ -78,8 +106,8 @@ class Review(BaseModel):
 {%- if num_code_suggestions > 0 %}

 class CodeSuggestion(BaseModel):
-    relevant_file: str = Field(description="The full file path of the relevant file.")
-    language: str = Field(description="The programming language of the relevant file.")
+    relevant_file: str = Field(description="The full file path of the relevant file")
+    language: str = Field(description="The programming language of the relevant file")
    suggestion: str = Field(description="a concrete suggestion for meaningfully improving the new PR code. Also describe how, specifically, the suggestion can be applied to new PR code. Add tags with importance measure that matches each suggestion ('important' or 'medium'). Do not make suggestions for updating or adding docstrings, renaming PR title and description, or linter like.")
    relevant_line: str = Field(description="a single code line taken from the relevant file, to which the suggestion applies. The code line should start with a '+'. Make sure to output the line exactly as it appears in the relevant file")
 {%- endif %}
@ -90,6 +118,7 @@ class PRReview(BaseModel):
    code_feedback: List[CodeSuggestion]
 {%- else %}

+
 class PRReview(BaseModel):
    review: Review
 {%- endif %}
@ -108,8 +137,22 @@ review:
 {%- endif %}
  relevant_tests: |
    No
-  key_issues_to_review: |
-    ...
+  key_issues_to_review:
+    - relevant_file: |
+        directory/xxx.py
+      issue_header: |
+        Possible Bug
+      issue_content: |
+        ...
+      start_line: 12
+      end_line: 14
+{%- if extra_issue_links %}
+      referenced_variables:
+      - variable_name: |
+          ...
+        relevant_line: 13
+{%- endif %}
+    - ...
  security_concerns: |
    No
 {%- if require_can_be_split_review %}
@ -120,6 +163,7 @@ review:
    title: ...
  - ...
 {%- endif %}
+
 {%- if num_code_suggestions > 0 %}
 code_feedback
 - relevant_file: |
--- a/pr_agent/tools/pr_reviewer.py
+++ b/pr_agent/tools/pr_reviewer.py
@ -75,6 +75,7 @@ class PRReviewer:
            "commit_messages_str": self.git_provider.get_commit_messages(),
            "custom_labels": "",
            "enable_custom_labels": get_settings().config.enable_custom_labels,
+            "extra_issue_links": get_settings().pr_reviewer.extra_issue_links,
        }

        self.token_handler = TokenHandler(
@ -147,7 +148,12 @@ class PRReviewer:
            get_logger().error(f"Failed to review PR: {e}")

    async def _prepare_prediction(self, model: str) -> None:
-        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
+        self.patches_diff = get_pr_diff(self.git_provider,
+                                        self.token_handler,
+                                        model,
+                                        add_line_numbers_to_hunks=True,
+                                        disable_extra_lines=True,)
+
        if self.patches_diff:
            get_logger().debug(f"PR diff", diff=self.patches_diff)
            self.prediction = await self._get_prediction(model)
@ -234,7 +240,7 @@ class PRReviewer:
            incremental_review_markdown_text = f"Starting from commit {last_commit_url}"

        markdown_text = convert_to_markdown_v2(data, self.git_provider.is_supported("gfm_markdown"),
-                                            incremental_review_markdown_text)
+                                            incremental_review_markdown_text, git_provider=self.git_provider)

        # Add help text if gfm_markdown is supported
        if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_reviewer.enable_help_text: