Merge pull request #638 from Codium-ai/tr/describe_bullets

insert_br_after_x_chars
This commit is contained in:
Tal
2024-02-05 02:06:13 -08:00
committed by GitHub
19 changed files with 211 additions and 157 deletions

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import re
from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger

View File

@ -1,9 +1,7 @@
from __future__ import annotations
import difflib
import re
import traceback
from typing import Any, Callable, List, Tuple
from typing import Callable, List, Tuple
from github import RateLimitExceededException
@ -13,7 +11,8 @@ from pr_agent.algo.file_filter import filter_ignored
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import get_max_tokens, ModelType
from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger
DELETED_FILES_ = "Deleted files:\n"
@ -270,78 +269,6 @@ def _get_all_deployments(all_models: List[str]) -> List[str]:
return all_deployments
def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
relevant_file: str,
relevant_line_in_file: str,
absolute_position: int = None) -> Tuple[int, int]:
position = -1
if absolute_position is None:
absolute_position = -1
re_hunk_header = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
for file in diff_files:
if file.filename and (file.filename.strip() == relevant_file):
patch = file.patch
patch_lines = patch.splitlines()
delta = 0
start1, size1, start2, size2 = 0, 0, 0, 0
if absolute_position != -1: # matching absolute to relative
for i, line in enumerate(patch_lines):
# new hunk
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
#
absolute_position_curr = start2 + delta - 1
if absolute_position_curr == absolute_position:
position = i
break
else:
# try to find the line in the patch using difflib, with some margin of error
matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
patch_lines, n=3, cutoff=0.93)
if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
relevant_line_in_file = matches_difflib[0]
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file in line and line[0] != '-':
position = i
absolute_position = start2 + delta - 1
break
if position == -1 and relevant_line_in_file[0] == '+':
no_plus_line = relevant_line_in_file[1:].lstrip()
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if no_plus_line in line and line[0] != '-':
# The model might add a '+' to the beginning of the relevant_line_in_file even if originally
# it's a context line
position = i
absolute_position = start2 + delta - 1
break
return position, absolute_position
def get_pr_multi_diffs(git_provider: GitProvider,
token_handler: TokenHandler,
model: str,

23
pr_agent/algo/types.py Normal file
View File

@ -0,0 +1,23 @@
from dataclasses import dataclass
from enum import Enum
class EDIT_TYPE(Enum):
ADDED = 1
DELETED = 2
MODIFIED = 3
RENAMED = 4
UNKNOWN = 5
@dataclass
class FilePatchInfo:
base_file: str
head_file: str
patch: str
filename: str
tokens: int = -1
edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
old_filename: str = None
num_plus_lines: int = -1
num_minus_lines: int = -1

View File

@ -6,7 +6,7 @@ import re
import textwrap
from datetime import datetime
from enum import Enum
from typing import Any, List
from typing import Any, List, Tuple
import yaml
from starlette_context import context
@ -14,6 +14,7 @@ from starlette_context import context
from pr_agent.algo import MAX_TOKENS
from pr_agent.algo.token_handler import get_token_encoder
from pr_agent.config_loader import get_settings, global_settings
from pr_agent.algo.types import FilePatchInfo
from pr_agent.log import get_logger
class ModelType(str, Enum):
@ -487,4 +488,76 @@ def replace_code_tags(text):
parts = text.split('`')
for i in range(1, len(parts), 2):
parts[i] = '<code>' + parts[i] + '</code>'
return ''.join(parts)
return ''.join(parts)
def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
relevant_file: str,
relevant_line_in_file: str,
absolute_position: int = None) -> Tuple[int, int]:
position = -1
if absolute_position is None:
absolute_position = -1
re_hunk_header = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
for file in diff_files:
if file.filename and (file.filename.strip() == relevant_file):
patch = file.patch
patch_lines = patch.splitlines()
delta = 0
start1, size1, start2, size2 = 0, 0, 0, 0
if absolute_position != -1: # matching absolute to relative
for i, line in enumerate(patch_lines):
# new hunk
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
#
absolute_position_curr = start2 + delta - 1
if absolute_position_curr == absolute_position:
position = i
break
else:
# try to find the line in the patch using difflib, with some margin of error
matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
patch_lines, n=3, cutoff=0.93)
if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
relevant_line_in_file = matches_difflib[0]
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file in line and line[0] != '-':
position = i
absolute_position = start2 + delta - 1
break
if position == -1 and relevant_line_in_file[0] == '+':
no_plus_line = relevant_line_in_file[1:].lstrip()
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if no_plus_line in line and line[0] != '-':
# The model might add a '+' to the beginning of the relevant_line_in_file even if originally
# it's a context line
position = i
absolute_position = start2 + delta - 1
break
return position, absolute_position

View File

@ -6,7 +6,8 @@ from ..log import get_logger
from ..algo.language_handler import is_valid_file
from ..algo.utils import clip_tokens, load_large_diff
from ..config_loader import get_settings
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
from .git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
AZURE_DEVOPS_AVAILABLE = True

View File

@ -6,10 +6,11 @@ import requests
from atlassian.bitbucket import Cloud
from starlette_context import context
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from pr_agent.algo.types import FilePatchInfo, EDIT_TYPE
from ..algo.utils import find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from ..log import get_logger
from .git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
from .git_provider import GitProvider
class BitbucketProvider(GitProvider):

View File

@ -6,9 +6,9 @@ import requests
from atlassian.bitbucket import Bitbucket
from starlette_context import context
from .git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from ..algo.utils import load_large_diff
from .git_provider import GitProvider
from pr_agent.algo.types import FilePatchInfo
from ..algo.utils import load_large_diff, find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from ..log import get_logger

View File

@ -5,9 +5,9 @@ from typing import List, Optional, Tuple
from urllib.parse import urlparse
from pr_agent.git_providers.codecommit_client import CodeCommitClient
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from ..algo.utils import load_large_diff
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
from .git_provider import GitProvider
from ..config_loader import get_settings
from ..log import get_logger

View File

@ -13,7 +13,8 @@ import urllib3.util
from git import Repo
from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.git_providers.local_git_provider import PullRequestMimic
from pr_agent.log import get_logger

View File

@ -1,35 +1,13 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
# enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED)
from enum import Enum
from typing import Optional
from pr_agent.config_loader import get_settings
from pr_agent.algo.types import FilePatchInfo
from pr_agent.log import get_logger
class EDIT_TYPE(Enum):
ADDED = 1
DELETED = 2
MODIFIED = 3
RENAMED = 4
UNKNOWN = 5
@dataclass
class FilePatchInfo:
base_file: str
head_file: str
patch: str
filename: str
tokens: int = -1
edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
old_filename: str = None
num_plus_lines: int = -1
num_minus_lines: int = -1
class GitProvider(ABC):
@abstractmethod
def is_supported(self, capability: str) -> bool:

View File

@ -9,12 +9,12 @@ from retry import retry
from starlette_context import context
from ..algo.language_handler import is_valid_file
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from ..algo.utils import load_large_diff, clip_tokens
from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from ..log import get_logger
from ..servers.utils import RateLimitExceeded
from .git_provider import FilePatchInfo, GitProvider, IncrementalPR, EDIT_TYPE
from .git_provider import GitProvider, IncrementalPR
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
class GithubProvider(GitProvider):

View File

@ -7,10 +7,10 @@ import gitlab
from gitlab import GitlabGetError
from ..algo.language_handler import is_valid_file
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from ..algo.utils import load_large_diff, clip_tokens
from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
from .git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from ..log import get_logger

View File

@ -5,7 +5,8 @@ from typing import List
from git import Repo
from pr_agent.config_loader import _find_repository_root, get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger

View File

@ -9,11 +9,11 @@ verbosity_level=0 # 0,1,2
use_extra_bad_extensions=false
use_repo_settings_file=true
use_global_settings_file=true
ai_timeout=180
ai_timeout=90
max_description_tokens = 500
max_commits_tokens = 500
max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities.
patch_extra_lines = 3
patch_extra_lines = 1
secret_provider="google_cloud_storage"
cli_mode=false

View File

@ -325,7 +325,7 @@ class PRCodeSuggestions:
pr_body += "<table>"
header = f"Suggestions"
delta = 77
delta = 75
header += "&nbsp; " * delta
pr_body += f"""<thead><tr><th></th><th>{header}</th></tr></thead>"""
pr_body += """<tbody>"""

View File

@ -9,7 +9,7 @@ from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels
from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels, ModelType
from pr_agent.config_loader import get_settings
from pr_agent.git_providers import get_git_provider
from pr_agent.git_providers.git_provider import get_main_pr_language
@ -80,7 +80,7 @@ class PRDescription:
if get_settings().config.publish_output:
self.git_provider.publish_comment("Preparing PR description...", is_temporary=True)
await retry_with_fallback_models(self._prepare_prediction)
await retry_with_fallback_models(self._prepare_prediction, ModelType.TURBO) # turbo model because larger context
get_logger().info(f"Preparing answer {self.pr_id}")
if self.prediction:
@ -363,7 +363,7 @@ class PRDescription:
try:
pr_body += "<table>"
header = f"Relevant files"
delta = 77
delta = 75
# header += "&nbsp; " * delta
pr_body += f"""<thead><tr><th></th><th align="left">{header}</th></tr></thead>"""
pr_body += """<tbody>"""
@ -379,8 +379,7 @@ class PRDescription:
for filename, file_changes_title, file_change_description in list_tuples:
filename = filename.replace("'", "`")
filename_publish = filename.split("/")[-1]
file_changes_title_br = insert_br_after_x_chars(file_changes_title, x=(delta - 5),
new_line_char="\n\n")
file_changes_title_br = insert_br_after_x_chars(file_changes_title, x=(delta - 5))
file_changes_title_extended = file_changes_title_br.strip() + "</code>"
if len(file_changes_title_extended) < (delta - 5):
file_changes_title_extended += "&nbsp; " * ((delta - 5) - len(file_changes_title_extended))
@ -428,48 +427,74 @@ class PRDescription:
pass
return pr_body
def insert_br_after_x_chars(text, x=70, new_line_char="<br> "):
def insert_br_after_x_chars(text, x=70):
"""
Insert <br> into a string after a word that increases its length above x characters.
Use proper HTML tags for code and new lines.
"""
if len(text) < x:
return text
lines = text.splitlines()
# replace odd instances of ` with <code> and even instances of ` with </code>
text = replace_code_tags(text)
# convert list items to <li>
if text.startswith("- "):
text = "<li>" + text[2:]
text = text.replace("\n- ", '<br><li> ').replace("\n - ", '<br><li> ')
# convert new lines to <br>
text = text.replace("\n", '<br>')
# split text into lines
lines = text.split('<br>')
words = []
for i,line in enumerate(lines):
for i, line in enumerate(lines):
words += line.split(' ')
if i<len(lines)-1:
words[-1] += "\n"
if i < len(lines) - 1:
words[-1] += "<br>"
def count_chars_without_html(string):
if '<' not in string:
return len(string)
no_html_string = re.sub('<[^>]+>', '', string)
return len(no_html_string)
# words = text.split(' ')
new_text = ""
current_length = 0
new_text = []
is_inside_code = False
current_length = 0
for word in words:
# Check if adding this word exceeds x characters
if current_length + len(word) > x:
if not is_inside_code:
new_text += f"{new_line_char} " # Insert line break
current_length = 0 # Reset counter
else:
new_text += f"`{new_line_char} `"
# check if inside <code> tag
if word.startswith("`") and not is_inside_code and not word.endswith("`"):
is_saved_word = False
if word == "<code>" or word == "</code>" or word == "<li>" or word == "<br>":
is_saved_word = True
if "<code>" in word:
is_inside_code = True
if word.endswith("`"):
if "</code>" in word:
is_inside_code = False
# Add the word to the new text
if word.endswith("\n"):
new_text += word
else:
new_text += word + " "
current_length += len(word) + 1 # Add 1 for the space
len_word = count_chars_without_html(word)
if not is_saved_word and (current_length + len_word > x):
if is_inside_code:
new_text.append("</code><br><code>")
else:
new_text.append("<br>")
current_length = 0 # Reset counter
new_text.append(word + " ")
if not is_saved_word:
current_length += len_word + 1 # Add 1 for the space
if word.endswith("\n"):
if word == "<li>" or word == "<br>":
current_length = 0
return new_text.strip() # Remove trailing space
return ''.join(new_text).strip()
def replace_code_tags(text):
"""
Replace odd instances of ` with <code> and even instances of ` with </code>
"""
parts = text.split('`')
for i in range(1, len(parts), 2):
parts[i] = '<code>' + parts[i] + '</code>'
return ''.join(parts)

View File

@ -3,7 +3,7 @@ from unittest.mock import patch
from pr_agent.git_providers.codecommit_provider import CodeCommitFile
from pr_agent.git_providers.codecommit_provider import CodeCommitProvider
from pr_agent.git_providers.codecommit_provider import PullRequestCCMimic
from pr_agent.git_providers.git_provider import EDIT_TYPE
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
class TestCodeCommitFile:

View File

@ -1,5 +1,6 @@
# Generated by CodiumAI
from pr_agent.algo.utils import convert_to_markdown
from pr_agent.tools.pr_description import insert_br_after_x_chars
"""
Code Analysis
@ -93,3 +94,27 @@ class TestConvertToMarkdown:
}
expected_output = ''
assert convert_to_markdown(input_data).strip() == expected_output.strip()
class TestBR:
def test_br1(self):
file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.'
file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from '
'<code>pr_agent.algo.types</code> instead <br>of '
'<code>pr_agent.git_providers.git_provider</code>.')
assert file_change_description_br == expected_output
# print("-----")
# print(file_change_description_br)
def test_br2(self):
file_change_description = ('- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`')
file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>'
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection <br>'
'ColorPaletteResourcesCollection</code>')
assert file_change_description_br == expected_output
# print("-----")
# print(file_change_description_br)

View File

@ -1,8 +1,7 @@
# Generated by CodiumAI
from pr_agent.git_providers.git_provider import FilePatchInfo
from pr_agent.algo.pr_processing import find_line_number_of_relevant_line_in_file
from pr_agent.algo.types import FilePatchInfo
from pr_agent.algo.utils import find_line_number_of_relevant_line_in_file
import pytest