enable ai_metadata

2025-07-21 04:50:39 +08:00 · 2024-09-07 17:25:05 +03:00
parent 24f7e8622f
commit 8706f643ef
32 changed files with 338 additions and 117 deletions
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@ -243,7 +243,7 @@ __old hunk__
    if hasattr(file, 'edit_type') and file.edit_type == EDIT_TYPE.DELETED:
        return f"\n\n## file '{file.filename.strip()}' was deleted\n"

-    patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
+    patch_with_lines_str = f"\n\n## File: '{file.filename.strip()}'\n"
    patch_lines = patch.splitlines()
    RE_HUNK_HEADER = re.compile(
        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@ -319,7 +319,7 @@ __old hunk__

 def extract_hunk_lines_from_patch(patch: str, file_name, line_start, line_end, side) -> tuple[str, str]:

-    patch_with_lines_str = f"\n\n## file: '{file_name.strip()}'\n\n"
+    patch_with_lines_str = f"\n\n## File: '{file_name.strip()}'\n\n"
    selected_lines = ""
    patch_lines = patch.splitlines()
    RE_HUNK_HEADER = re.compile(
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -200,6 +200,10 @@ def pr_generate_extended_diff(pr_languages: list,
            if add_line_numbers_to_hunks:
                full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)

+            # add AI-summary metadata to the patch
+            if file.ai_file_summary and  get_settings().get("config.enable_ai_metadata", False):
+                full_extended_patch = add_ai_summary_top_patch(file, full_extended_patch)
+
            patch_tokens = token_handler.count_tokens(full_extended_patch)
            file.tokens = patch_tokens
            total_tokens += patch_tokens
@ -239,6 +243,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
        if convert_hunks_to_line_numbers:
            patch = convert_to_hunks_with_lines_numbers(patch, file)

+        ## add AI-summary metadata to the patch (disabled, since we are in the compressed diff)
+        # if file.ai_file_summary and get_settings().config.get('config.is_auto_command', False):
+        #     patch = add_ai_summary_top_patch(file, patch)
+
        new_patch_tokens = token_handler.count_tokens(patch)
        file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type}

@ -304,7 +312,7 @@ def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_mod

        if patch:
            if not convert_hunks_to_line_numbers:
-                patch_final = f"\n\n## file: '{filename.strip()}\n\n{patch.strip()}\n'"
+                patch_final = f"\n\n## File: '{filename.strip()}\n\n{patch.strip()}\n'"
            else:
                patch_final = "\n\n" + patch.strip()
            patches.append(patch_final)
@ -432,6 +440,9 @@ def get_pr_multi_diffs(git_provider: GitProvider,
            continue

        patch = convert_to_hunks_with_lines_numbers(patch, file)
+        # add AI-summary metadata to the patch
+        if file.ai_file_summary and get_settings().get("config.enable_ai_metadata", False):
+            patch = add_ai_summary_top_patch(file, patch)
        new_patch_tokens = token_handler.count_tokens(patch)

        if patch and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens(
@ -479,3 +490,33 @@ def get_pr_multi_diffs(git_provider: GitProvider,
        final_diff_list.append(final_diff)

    return final_diff_list
+
+
+def add_ai_metadata_to_diff_files(git_provider, pr_description_files):
+    """
+    Adds AI metadata to the diff files based on the PR description files (FilePatchInfo.ai_file_summary).
+
+    Every file returned by `git_provider.get_diff_files()` whose stripped filename equals the stripped
+    'full_file_name' of a description entry gets that entry assigned to its `ai_file_summary`
+    attribute. Files without a matching entry are left untouched and reported at info level.
+
+    Args:
+        git_provider: object exposing `get_diff_files()`.
+        pr_description_files: iterable of dicts carrying a 'full_file_name' key; `None` is treated
+            as an empty collection.
+
+    Returns:
+        None. The diff-file objects are updated in place.
+    """
+    diff_files = git_provider.get_diff_files()
+
+    # Index the description entries once, so each diff file costs a single lookup instead of a
+    # full rescan. setdefault keeps the first entry when a filename appears more than once.
+    summaries_by_name = {}
+    for pr_file in pr_description_files or []:
+        summaries_by_name.setdefault(pr_file['full_file_name'].strip(), pr_file)
+
+    for file in diff_files:
+        filename = file.filename.strip()
+        if filename in summaries_by_name:
+            file.ai_file_summary = summaries_by_name[filename]
+        else:
+            get_logger().info(f"File {filename} not found in the PR description files",
+                              artifacts=pr_description_files)
+
+
+def add_ai_summary_top_patch(file, full_extended_patch):
+    """
+    Inserts the AI-generated summary of `file` directly below the first file header of the patch.
+
+    Only the first line starting with '## File:' or '## file:' is annotated. The patch is returned
+    unchanged when no such line exists, or when `file.ai_file_summary` offers no 'long_summary'.
+
+    Args:
+        file: object whose `ai_file_summary` is a mapping with a 'long_summary' key.
+        full_extended_patch: patch text to annotate.
+
+    Returns:
+        The patch text, with a '### AI-generated file summary:' section added below the header.
+    """
+    try:
+        long_summary = file.ai_file_summary['long_summary']
+    except (KeyError, TypeError):
+        # nothing usable to insert (key missing, or the summary is not subscriptable)
+        return full_extended_patch
+
+    patch_lines = full_extended_patch.split("\n")
+    for i, line in enumerate(patch_lines):
+        if line.startswith(("## File:", "## file:")):
+            patch_lines.insert(i + 1, f"### AI-generated file summary:\n{long_summary}")
+            # a single header is annotated; stop before iterating over the modified list
+            break
+    return "\n".join(patch_lines)
--- a/pr_agent/algo/types.py
+++ b/pr_agent/algo/types.py
@ -21,3 +21,4 @@ class FilePatchInfo:
    old_filename: str = None
    num_plus_lines: int = -1
    num_minus_lines: int = -1
+    ai_file_summary: str = None
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@ -1,4 +1,5 @@
 from __future__ import annotations
+import html2text

 import html
 import copy
@ -214,19 +215,6 @@ def convert_to_markdown_v2(output_data: dict,
                        reference_link = git_provider.get_line_link(relevant_file, start_line, end_line)

                        if gfm_supported:
-                            if get_settings().pr_reviewer.extra_issue_links:
-                                issue_content_linked =copy.deepcopy(issue_content)
-                                referenced_variables_list = issue.get('referenced_variables', [])
-                                for component in referenced_variables_list:
-                                    name = component['variable_name'].strip().strip('`')
-
-                                    ind = issue_content.find(name)
-                                    if ind != -1:
-                                        reference_link_component = git_provider.get_line_link(relevant_file, component['relevant_line'], component['relevant_line'])
-                                        issue_content_linked = issue_content_linked[:ind-1] + f"[`{name}`]({reference_link_component})" + issue_content_linked[ind+len(name)+1:]
-                                    else:
-                                        get_logger().info(f"Failed to find variable in issue content: {component['variable_name'].strip()}")
-                                issue_content = issue_content_linked
                            issue_str = f"<a href='{reference_link}'><strong>{issue_header}</strong></a><br>{issue_content}"
                        else:
                            issue_str = f"[**{issue_header}**]({reference_link})\n\n{issue_content}\n\n"
@ -945,3 +933,66 @@ def is_value_no(value):
    if value_str == 'no' or value_str == 'none' or value_str == 'false':
        return True
    return False
+
+
+def process_description(description_full: str) -> tuple[str, list[dict]]:
+    """
+    Splits a PR description into its base text and the per-file entries of its walkthrough table.
+
+    The description is split on the '### **Changes walkthrough** 📝' heading. The text before the
+    heading is returned as-is; the text after it (cut at the end of the table) is scanned for
+    '<tr><td><details>...</details></td>' rows, and each row that parses yields one dict.
+
+    Args:
+        description_full: the complete PR description; `None` is handled like an empty string.
+
+    Returns:
+        A `(base_description, files)` tuple. `files` is a list of dicts with the keys
+        'short_file_name', 'full_file_name', 'short_summary' and 'long_summary'. Rows that cannot
+        be parsed are logged and skipped, so the list may be empty.
+    """
+    # a missing description is treated as empty instead of failing on `.split`
+    description_full = description_full or ""
+    split_str = "### **Changes walkthrough** 📝"
+    description_split = description_full.split(split_str)
+    base_description_str = description_split[0]
+    changes_walkthrough_str = ""
+    files = []
+    if len(description_split) > 1:
+        changes_walkthrough_str = description_split[1]
+    else:
+        get_logger().debug("No changes walkthrough found")
+
+    try:
+        if changes_walkthrough_str:
+            # get the end of the table
+            if '</table>\n\n___' in changes_walkthrough_str:
+                end = changes_walkthrough_str.index("</table>\n\n___")
+            elif '\n___' in changes_walkthrough_str:
+                end = changes_walkthrough_str.index("\n___")
+            else:
+                end = len(changes_walkthrough_str)
+            changes_walkthrough_str = changes_walkthrough_str[:end]
+
+            h = html2text.HTML2Text()
+            h.body_width = 0  # Disable line wrapping
+
+            # one match per table row holding a <details> element
+            row_pattern = r'<tr>\s*<td>\s*(<details>\s*<summary>(.*?)</summary>(.*?)</details>)\s*</td>'
+            # groups: <strong> text, <code> text, text after <hr> up to the first blank line, the rest
+            details_pattern = r'<details>\s*<summary><strong>(.*?)</strong><dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\n\n\s*(.*?)</details>'
+
+            files_found = re.findall(row_pattern, changes_walkthrough_str, re.DOTALL)
+            for file_data in files_found:
+                try:
+                    if isinstance(file_data, tuple):
+                        # findall returns one tuple per row; the first group is the whole <details> element
+                        file_data = file_data[0]
+                    res = re.search(details_pattern, file_data, re.DOTALL)
+                    if res and res.lastindex == 4:
+                        short_filename = res.group(1).strip()
+                        short_summary = res.group(2).strip()
+                        long_filename = res.group(3).strip()
+                        long_summary = res.group(4).strip()
+                        # normalise the line breaks to <br> tags before converting the HTML to text
+                        long_summary = long_summary.replace('<br> *', '\n*').replace('<br>', '').replace('\n', '<br>')
+                        long_summary = h.handle(long_summary).strip()
+                        if not long_summary.startswith('*'):
+                            long_summary = f"* {long_summary}"
+
+                        files.append({
+                            'short_file_name': short_filename,
+                            'full_file_name': long_filename,
+                            'short_summary': short_summary,
+                            'long_summary': long_summary
+                        })
+                    else:
+                        get_logger().error("Failed to parse description", artifact={'description': file_data})
+                except Exception as e:
+                    # a single malformed row must not discard the rows that did parse
+                    get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data})
+
+    except Exception as e:
+        get_logger().exception(f"Failed to process description: {e}")
+
+    return base_description_str, files