Merge commit 'e4f177908b620e46740b03966fda9243473d979e' into hl/pr_review_table

2025-07-21 04:50:39 +08:00 · 2024-02-08 14:26:29 +02:00
parent ddb89a7474 e4f177908b
commit c2088b7752
41 changed files with 653 additions and 383 deletions
--- a/pr_agent/algo/init.py
+++ b/pr_agent/algo/init.py
@ -9,6 +9,7 @@ MAX_TOKENS = {
    'gpt-4-0613': 8000,
    'gpt-4-32k': 32000,
    'gpt-4-1106-preview': 128000, # 128K, but may be limited by config.max_model_tokens
+    'gpt-4-0125-preview': 128000,  # 128K, but may be limited by config.max_model_tokens
    'claude-instant-1': 100000,
    'claude-2': 100000,
    'command-nightly': 4096,
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@ -3,7 +3,7 @@ from __future__ import annotations
 import re

 from pr_agent.config_loader import get_settings
-from pr_agent.git_providers.git_provider import EDIT_TYPE
+from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
 from pr_agent.log import get_logger


@ -181,7 +181,7 @@ __old hunk__
           ...
    """
    
-    patch_with_lines_str = f"\n\n## {file.filename}\n"
+    patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
    patch_lines = patch.splitlines()
    RE_HUNK_HEADER = re.compile(
        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@ -202,11 +202,11 @@ __old hunk__
                if new_content_lines:
                    if prev_header_line:
                        patch_with_lines_str += f'\n{prev_header_line}\n'
-                    patch_with_lines_str += '__new hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
                    for i, line_new in enumerate(new_content_lines):
                        patch_with_lines_str += f"{start2 + i} {line_new}\n"
                if old_content_lines:
-                    patch_with_lines_str += '__old hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
                    for line_old in old_content_lines:
                        patch_with_lines_str += f"{line_old}\n"
                new_content_lines = []
@ -236,11 +236,11 @@ __old hunk__
    if match and new_content_lines:
        if new_content_lines:
            patch_with_lines_str += f'\n{header_line}\n'
-            patch_with_lines_str += '\n__new hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
            for i, line_new in enumerate(new_content_lines):
                patch_with_lines_str += f"{start2 + i} {line_new}\n"
        if old_content_lines:
-            patch_with_lines_str += '\n__old hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
            for line_old in old_content_lines:
                patch_with_lines_str += f"{line_old}\n"

--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -1,9 +1,7 @@
 from __future__ import annotations

-import difflib
-import re
 import traceback
-from typing import Any, Callable, List, Tuple
+from typing import Callable, List, Tuple

 from github import RateLimitExceededException

@ -11,9 +9,10 @@ from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbe
 from pr_agent.algo.language_handler import sort_files_by_main_languages
 from pr_agent.algo.file_filter import filter_ignored
 from pr_agent.algo.token_handler import TokenHandler
-from pr_agent.algo.utils import get_max_tokens
+from pr_agent.algo.utils import get_max_tokens, ModelType
 from pr_agent.config_loader import get_settings
-from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
+from pr_agent.git_providers.git_provider import GitProvider
+from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
 from pr_agent.log import get_logger

 DELETED_FILES_ = "Deleted files:\n"
@ -209,9 +208,9 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo

        if patch:
            if not convert_hunks_to_line_numbers:
-                patch_final = f"## {file.filename}\n\n{patch}\n"
+                patch_final = f"\n\n## file: '{file.filename.strip()}\n\n{patch.strip()}\n'"
            else:
-                patch_final = patch
+                patch_final = "\n\n" + patch.strip()
            patches.append(patch_final)
            total_tokens += token_handler.count_tokens(patch_final)
            if get_settings().config.verbosity_level >= 2:
@ -220,8 +219,8 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
    return patches, modified_files_list, deleted_files_list, added_files_list


-async def retry_with_fallback_models(f: Callable):
-    all_models = _get_all_models()
+async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR):
+    all_models = _get_all_models(model_type)
    all_deployments = _get_all_deployments(all_models)
    # try each (model, deployment_id) pair until one is successful, otherwise raise exception
    for i, (model, deployment_id) in enumerate(zip(all_models, all_deployments)):
@ -243,8 +242,11 @@ async def retry_with_fallback_models(f: Callable):
                raise  # Re-raise the last exception


-def _get_all_models() -> List[str]:
-    model = get_settings().config.model
+def _get_all_models(model_type: ModelType = ModelType.REGULAR) -> List[str]:
+    if model_type == ModelType.TURBO:
+        model = get_settings().config.model_turbo
+    else:
+        model = get_settings().config.model
    fallback_models = get_settings().config.fallback_models
    if not isinstance(fallback_models, list):
        fallback_models = [m.strip() for m in fallback_models.split(",")]
@ -267,78 +269,6 @@ def _get_all_deployments(all_models: List[str]) -> List[str]:
    return all_deployments


-def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
-                                              relevant_file: str,
-                                              relevant_line_in_file: str,
-                                              absolute_position: int = None) -> Tuple[int, int]:
-    position = -1
-    if absolute_position is None:
-        absolute_position = -1
-    re_hunk_header = re.compile(
-        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
-
-    for file in diff_files:
-        if file.filename and (file.filename.strip() == relevant_file):
-            patch = file.patch
-            patch_lines = patch.splitlines()
-            delta = 0
-            start1, size1, start2, size2 = 0, 0, 0, 0
-            if absolute_position != -1: # matching absolute to relative
-                for i, line in enumerate(patch_lines):
-                    # new hunk
-                    if line.startswith('@@'):
-                        delta = 0
-                        match = re_hunk_header.match(line)
-                        start1, size1, start2, size2 = map(int, match.groups()[:4])
-                    elif not line.startswith('-'):
-                        delta += 1
-
-                    #
-                    absolute_position_curr = start2 + delta - 1
-
-                    if absolute_position_curr == absolute_position:
-                        position = i
-                        break
-            else:
-                # try to find the line in the patch using difflib, with some margin of error
-                matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
-                                                                             patch_lines, n=3, cutoff=0.93)
-                if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
-                    relevant_line_in_file = matches_difflib[0]
-
-
-                for i, line in enumerate(patch_lines):
-                    if line.startswith('@@'):
-                        delta = 0
-                        match = re_hunk_header.match(line)
-                        start1, size1, start2, size2 = map(int, match.groups()[:4])
-                    elif not line.startswith('-'):
-                        delta += 1
-
-                    if relevant_line_in_file in line and line[0] != '-':
-                        position = i
-                        absolute_position = start2 + delta - 1
-                        break
-
-                if position == -1 and relevant_line_in_file[0] == '+':
-                    no_plus_line = relevant_line_in_file[1:].lstrip()
-                    for i, line in enumerate(patch_lines):
-                        if line.startswith('@@'):
-                            delta = 0
-                            match = re_hunk_header.match(line)
-                            start1, size1, start2, size2 = map(int, match.groups()[:4])
-                        elif not line.startswith('-'):
-                            delta += 1
-
-                        if no_plus_line in line and line[0] != '-':
-                            # The model might add a '+' to the beginning of the relevant_line_in_file even if originally
-                            # it's a context line
-                            position = i
-                            absolute_position = start2 + delta - 1
-                            break
-    return position, absolute_position
-
-
 def get_pr_multi_diffs(git_provider: GitProvider,
                       token_handler: TokenHandler,
                       model: str,
@ -375,6 +305,13 @@ def get_pr_multi_diffs(git_provider: GitProvider,
    for lang in pr_languages:
        sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))

+
+    # try first a single run with standard diff string, with patch extension, and no deletions
+    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
+        pr_languages, token_handler, add_line_numbers_to_hunks=True)
+    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model):
+        return ["\n".join(patches_extended)]
+
    patches = []
    final_diff_list = []
    total_tokens = token_handler.prompt_tokens
@ -398,6 +335,11 @@ def get_pr_multi_diffs(git_provider: GitProvider,

        patch = convert_to_hunks_with_lines_numbers(patch, file)
        new_patch_tokens = token_handler.count_tokens(patch)
+
+        if patch and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
+            get_logger().warning(f"Patch too large, skipping: {file.filename}")
+            continue
+
        if patch and (total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD):
            final_diff = "\n".join(patches)
            final_diff_list.append(final_diff)
--- a/pr_agent/algo/types.py
+++ b/pr_agent/algo/types.py
@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from enum import Enum
+
+
+class EDIT_TYPE(Enum):
+    ADDED = 1
+    DELETED = 2
+    MODIFIED = 3
+    RENAMED = 4
+    UNKNOWN = 5
+
+
+@dataclass
+class FilePatchInfo:
+    base_file: str
+    head_file: str
+    patch: str
+    filename: str
+    tokens: int = -1
+    edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
+    old_filename: str = None
+    num_plus_lines: int = -1
+    num_minus_lines: int = -1
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@ -5,7 +5,8 @@ import json
 import re
 import textwrap
 from datetime import datetime
-from typing import Any, List
+from enum import Enum
+from typing import Any, List, Tuple

 import yaml
 from starlette_context import context
@ -13,8 +14,12 @@ from starlette_context import context
 from pr_agent.algo import MAX_TOKENS
 from pr_agent.algo.token_handler import get_token_encoder
 from pr_agent.config_loader import get_settings, global_settings
+from pr_agent.algo.types import FilePatchInfo
 from pr_agent.log import get_logger

+class ModelType(str, Enum):
+    REGULAR = "regular"
+    TURBO = "turbo"

 def get_setting(key: str) -> Any:
    try:
@ -489,4 +494,76 @@ def replace_code_tags(text):
    parts = text.split('`')
    for i in range(1, len(parts), 2):
        parts[i] = '<code>' + parts[i] + '</code>'
-    return ''.join(parts)
+    return ''.join(parts)
+
+
+def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
+                                              relevant_file: str,
+                                              relevant_line_in_file: str,
+                                              absolute_position: int = None) -> Tuple[int, int]:
+    position = -1
+    if absolute_position is None:
+        absolute_position = -1
+    re_hunk_header = re.compile(
+        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
+
+    for file in diff_files:
+        if file.filename and (file.filename.strip() == relevant_file):
+            patch = file.patch
+            patch_lines = patch.splitlines()
+            delta = 0
+            start1, size1, start2, size2 = 0, 0, 0, 0
+            if absolute_position != -1: # matching absolute to relative
+                for i, line in enumerate(patch_lines):
+                    # new hunk
+                    if line.startswith('@@'):
+                        delta = 0
+                        match = re_hunk_header.match(line)
+                        start1, size1, start2, size2 = map(int, match.groups()[:4])
+                    elif not line.startswith('-'):
+                        delta += 1
+
+                    #
+                    absolute_position_curr = start2 + delta - 1
+
+                    if absolute_position_curr == absolute_position:
+                        position = i
+                        break
+            else:
+                # try to find the line in the patch using difflib, with some margin of error
+                matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
+                                                                             patch_lines, n=3, cutoff=0.93)
+                if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
+                    relevant_line_in_file = matches_difflib[0]
+
+
+                for i, line in enumerate(patch_lines):
+                    if line.startswith('@@'):
+                        delta = 0
+                        match = re_hunk_header.match(line)
+                        start1, size1, start2, size2 = map(int, match.groups()[:4])
+                    elif not line.startswith('-'):
+                        delta += 1
+
+                    if relevant_line_in_file in line and line[0] != '-':
+                        position = i
+                        absolute_position = start2 + delta - 1
+                        break
+
+                if position == -1 and relevant_line_in_file[0] == '+':
+                    no_plus_line = relevant_line_in_file[1:].lstrip()
+                    for i, line in enumerate(patch_lines):
+                        if line.startswith('@@'):
+                            delta = 0
+                            match = re_hunk_header.match(line)
+                            start1, size1, start2, size2 = map(int, match.groups()[:4])
+                        elif not line.startswith('-'):
+                            delta += 1
+
+                        if no_plus_line in line and line[0] != '-':
+                            # The model might add a '+' to the beginning of the relevant_line_in_file even if originally
+                            # it's a context line
+                            position = i
+                            absolute_position = start2 + delta - 1
+                            break
+    return position, absolute_position