Merge pull request #638 from Codium-ai/tr/describe_bullets

insert_br_after_x_chars
This commit is contained in:
Tal
2024-02-05 02:06:13 -08:00
committed by GitHub
19 changed files with 211 additions and 157 deletions

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import re import re
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger from pr_agent.log import get_logger

View File

@ -1,9 +1,7 @@
from __future__ import annotations from __future__ import annotations
import difflib
import re
import traceback import traceback
from typing import Any, Callable, List, Tuple from typing import Callable, List, Tuple
from github import RateLimitExceededException from github import RateLimitExceededException
@ -13,7 +11,8 @@ from pr_agent.algo.file_filter import filter_ignored
from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import get_max_tokens, ModelType from pr_agent.algo.utils import get_max_tokens, ModelType
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider, EDIT_TYPE from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger from pr_agent.log import get_logger
DELETED_FILES_ = "Deleted files:\n" DELETED_FILES_ = "Deleted files:\n"
@ -270,78 +269,6 @@ def _get_all_deployments(all_models: List[str]) -> List[str]:
return all_deployments return all_deployments
def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
relevant_file: str,
relevant_line_in_file: str,
absolute_position: int = None) -> Tuple[int, int]:
position = -1
if absolute_position is None:
absolute_position = -1
re_hunk_header = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
for file in diff_files:
if file.filename and (file.filename.strip() == relevant_file):
patch = file.patch
patch_lines = patch.splitlines()
delta = 0
start1, size1, start2, size2 = 0, 0, 0, 0
if absolute_position != -1: # matching absolute to relative
for i, line in enumerate(patch_lines):
# new hunk
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
#
absolute_position_curr = start2 + delta - 1
if absolute_position_curr == absolute_position:
position = i
break
else:
# try to find the line in the patch using difflib, with some margin of error
matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
patch_lines, n=3, cutoff=0.93)
if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
relevant_line_in_file = matches_difflib[0]
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file in line and line[0] != '-':
position = i
absolute_position = start2 + delta - 1
break
if position == -1 and relevant_line_in_file[0] == '+':
no_plus_line = relevant_line_in_file[1:].lstrip()
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if no_plus_line in line and line[0] != '-':
# The model might add a '+' to the beginning of the relevant_line_in_file even if originally
# it's a context line
position = i
absolute_position = start2 + delta - 1
break
return position, absolute_position
def get_pr_multi_diffs(git_provider: GitProvider, def get_pr_multi_diffs(git_provider: GitProvider,
token_handler: TokenHandler, token_handler: TokenHandler,
model: str, model: str,

23
pr_agent/algo/types.py Normal file
View File

@ -0,0 +1,23 @@
from dataclasses import dataclass
from enum import Enum
class EDIT_TYPE(Enum):
ADDED = 1
DELETED = 2
MODIFIED = 3
RENAMED = 4
UNKNOWN = 5
@dataclass
class FilePatchInfo:
base_file: str
head_file: str
patch: str
filename: str
tokens: int = -1
edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
old_filename: str = None
num_plus_lines: int = -1
num_minus_lines: int = -1

View File

@ -6,7 +6,7 @@ import re
import textwrap import textwrap
from datetime import datetime from datetime import datetime
from enum import Enum from enum import Enum
from typing import Any, List from typing import Any, List, Tuple
import yaml import yaml
from starlette_context import context from starlette_context import context
@ -14,6 +14,7 @@ from starlette_context import context
from pr_agent.algo import MAX_TOKENS from pr_agent.algo import MAX_TOKENS
from pr_agent.algo.token_handler import get_token_encoder from pr_agent.algo.token_handler import get_token_encoder
from pr_agent.config_loader import get_settings, global_settings from pr_agent.config_loader import get_settings, global_settings
from pr_agent.algo.types import FilePatchInfo
from pr_agent.log import get_logger from pr_agent.log import get_logger
class ModelType(str, Enum): class ModelType(str, Enum):
@ -488,3 +489,75 @@ def replace_code_tags(text):
for i in range(1, len(parts), 2): for i in range(1, len(parts), 2):
parts[i] = '<code>' + parts[i] + '</code>' parts[i] = '<code>' + parts[i] + '</code>'
return ''.join(parts) return ''.join(parts)
def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
relevant_file: str,
relevant_line_in_file: str,
absolute_position: int = None) -> Tuple[int, int]:
position = -1
if absolute_position is None:
absolute_position = -1
re_hunk_header = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
for file in diff_files:
if file.filename and (file.filename.strip() == relevant_file):
patch = file.patch
patch_lines = patch.splitlines()
delta = 0
start1, size1, start2, size2 = 0, 0, 0, 0
if absolute_position != -1: # matching absolute to relative
for i, line in enumerate(patch_lines):
# new hunk
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
#
absolute_position_curr = start2 + delta - 1
if absolute_position_curr == absolute_position:
position = i
break
else:
# try to find the line in the patch using difflib, with some margin of error
matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
patch_lines, n=3, cutoff=0.93)
if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
relevant_line_in_file = matches_difflib[0]
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file in line and line[0] != '-':
position = i
absolute_position = start2 + delta - 1
break
if position == -1 and relevant_line_in_file[0] == '+':
no_plus_line = relevant_line_in_file[1:].lstrip()
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = re_hunk_header.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if no_plus_line in line and line[0] != '-':
# The model might add a '+' to the beginning of the relevant_line_in_file even if originally
# it's a context line
position = i
absolute_position = start2 + delta - 1
break
return position, absolute_position

View File

@ -6,7 +6,8 @@ from ..log import get_logger
from ..algo.language_handler import is_valid_file from ..algo.language_handler import is_valid_file
from ..algo.utils import clip_tokens, load_large_diff from ..algo.utils import clip_tokens, load_large_diff
from ..config_loader import get_settings from ..config_loader import get_settings
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider from .git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
AZURE_DEVOPS_AVAILABLE = True AZURE_DEVOPS_AVAILABLE = True

View File

@ -6,10 +6,11 @@ import requests
from atlassian.bitbucket import Cloud from atlassian.bitbucket import Cloud
from starlette_context import context from starlette_context import context
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file from pr_agent.algo.types import FilePatchInfo, EDIT_TYPE
from ..algo.utils import find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger
from .git_provider import FilePatchInfo, GitProvider, EDIT_TYPE from .git_provider import GitProvider
class BitbucketProvider(GitProvider): class BitbucketProvider(GitProvider):

View File

@ -6,9 +6,9 @@ import requests
from atlassian.bitbucket import Bitbucket from atlassian.bitbucket import Bitbucket
from starlette_context import context from starlette_context import context
from .git_provider import FilePatchInfo, GitProvider, EDIT_TYPE from .git_provider import GitProvider
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file from pr_agent.algo.types import FilePatchInfo
from ..algo.utils import load_large_diff from ..algo.utils import load_large_diff, find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger

View File

@ -5,9 +5,9 @@ from typing import List, Optional, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from pr_agent.git_providers.codecommit_client import CodeCommitClient from pr_agent.git_providers.codecommit_client import CodeCommitClient
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from ..algo.utils import load_large_diff from ..algo.utils import load_large_diff
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider from .git_provider import GitProvider
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger

View File

@ -13,7 +13,8 @@ import urllib3.util
from git import Repo from git import Repo
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE, FilePatchInfo, GitProvider from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.git_providers.local_git_provider import PullRequestMimic from pr_agent.git_providers.local_git_provider import PullRequestMimic
from pr_agent.log import get_logger from pr_agent.log import get_logger

View File

@ -1,35 +1,13 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass
# enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED) # enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED)
from enum import Enum
from typing import Optional from typing import Optional
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.algo.types import FilePatchInfo
from pr_agent.log import get_logger from pr_agent.log import get_logger
class EDIT_TYPE(Enum):
ADDED = 1
DELETED = 2
MODIFIED = 3
RENAMED = 4
UNKNOWN = 5
@dataclass
class FilePatchInfo:
base_file: str
head_file: str
patch: str
filename: str
tokens: int = -1
edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
old_filename: str = None
num_plus_lines: int = -1
num_minus_lines: int = -1
class GitProvider(ABC): class GitProvider(ABC):
@abstractmethod @abstractmethod
def is_supported(self, capability: str) -> bool: def is_supported(self, capability: str) -> bool:

View File

@ -9,12 +9,12 @@ from retry import retry
from starlette_context import context from starlette_context import context
from ..algo.language_handler import is_valid_file from ..algo.language_handler import is_valid_file
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file
from ..algo.utils import load_large_diff, clip_tokens
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger
from ..servers.utils import RateLimitExceeded from ..servers.utils import RateLimitExceeded
from .git_provider import FilePatchInfo, GitProvider, IncrementalPR, EDIT_TYPE from .git_provider import GitProvider, IncrementalPR
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
class GithubProvider(GitProvider): class GithubProvider(GitProvider):

View File

@ -7,10 +7,10 @@ import gitlab
from gitlab import GitlabGetError from gitlab import GitlabGetError
from ..algo.language_handler import is_valid_file from ..algo.language_handler import is_valid_file
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file from ..algo.utils import load_large_diff, clip_tokens, find_line_number_of_relevant_line_in_file
from ..algo.utils import load_large_diff, clip_tokens
from ..config_loader import get_settings from ..config_loader import get_settings
from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider from .git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from ..log import get_logger from ..log import get_logger

View File

@ -5,7 +5,8 @@ from typing import List
from git import Repo from git import Repo
from pr_agent.config_loader import _find_repository_root, get_settings from pr_agent.config_loader import _find_repository_root, get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE, FilePatchInfo, GitProvider from pr_agent.git_providers.git_provider import GitProvider
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from pr_agent.log import get_logger from pr_agent.log import get_logger

View File

@ -9,11 +9,11 @@ verbosity_level=0 # 0,1,2
use_extra_bad_extensions=false use_extra_bad_extensions=false
use_repo_settings_file=true use_repo_settings_file=true
use_global_settings_file=true use_global_settings_file=true
ai_timeout=180 ai_timeout=90
max_description_tokens = 500 max_description_tokens = 500
max_commits_tokens = 500 max_commits_tokens = 500
max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities. max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities.
patch_extra_lines = 3 patch_extra_lines = 1
secret_provider="google_cloud_storage" secret_provider="google_cloud_storage"
cli_mode=false cli_mode=false

View File

@ -325,7 +325,7 @@ class PRCodeSuggestions:
pr_body += "<table>" pr_body += "<table>"
header = f"Suggestions" header = f"Suggestions"
delta = 77 delta = 75
header += "&nbsp; " * delta header += "&nbsp; " * delta
pr_body += f"""<thead><tr><th></th><th>{header}</th></tr></thead>""" pr_body += f"""<thead><tr><th></th><th>{header}</th></tr></thead>"""
pr_body += """<tbody>""" pr_body += """<tbody>"""

View File

@ -9,7 +9,7 @@ from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels, ModelType
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers import get_git_provider from pr_agent.git_providers import get_git_provider
from pr_agent.git_providers.git_provider import get_main_pr_language from pr_agent.git_providers.git_provider import get_main_pr_language
@ -80,7 +80,7 @@ class PRDescription:
if get_settings().config.publish_output: if get_settings().config.publish_output:
self.git_provider.publish_comment("Preparing PR description...", is_temporary=True) self.git_provider.publish_comment("Preparing PR description...", is_temporary=True)
await retry_with_fallback_models(self._prepare_prediction) await retry_with_fallback_models(self._prepare_prediction, ModelType.TURBO) # turbo model because larger context
get_logger().info(f"Preparing answer {self.pr_id}") get_logger().info(f"Preparing answer {self.pr_id}")
if self.prediction: if self.prediction:
@ -363,7 +363,7 @@ class PRDescription:
try: try:
pr_body += "<table>" pr_body += "<table>"
header = f"Relevant files" header = f"Relevant files"
delta = 77 delta = 75
# header += "&nbsp; " * delta # header += "&nbsp; " * delta
pr_body += f"""<thead><tr><th></th><th align="left">{header}</th></tr></thead>""" pr_body += f"""<thead><tr><th></th><th align="left">{header}</th></tr></thead>"""
pr_body += """<tbody>""" pr_body += """<tbody>"""
@ -379,8 +379,7 @@ class PRDescription:
for filename, file_changes_title, file_change_description in list_tuples: for filename, file_changes_title, file_change_description in list_tuples:
filename = filename.replace("'", "`") filename = filename.replace("'", "`")
filename_publish = filename.split("/")[-1] filename_publish = filename.split("/")[-1]
file_changes_title_br = insert_br_after_x_chars(file_changes_title, x=(delta - 5), file_changes_title_br = insert_br_after_x_chars(file_changes_title, x=(delta - 5))
new_line_char="\n\n")
file_changes_title_extended = file_changes_title_br.strip() + "</code>" file_changes_title_extended = file_changes_title_br.strip() + "</code>"
if len(file_changes_title_extended) < (delta - 5): if len(file_changes_title_extended) < (delta - 5):
file_changes_title_extended += "&nbsp; " * ((delta - 5) - len(file_changes_title_extended)) file_changes_title_extended += "&nbsp; " * ((delta - 5) - len(file_changes_title_extended))
@ -428,48 +427,74 @@ class PRDescription:
pass pass
return pr_body return pr_body
def insert_br_after_x_chars(text, x=70, new_line_char="<br> "): def insert_br_after_x_chars(text, x=70):
""" """
Insert <br> into a string after a word that increases its length above x characters. Insert <br> into a string after a word that increases its length above x characters.
Use proper HTML tags for code and new lines.
""" """
if len(text) < x: if len(text) < x:
return text return text
lines = text.splitlines() # replace odd instances of ` with <code> and even instances of ` with </code>
text = replace_code_tags(text)
# convert list items to <li>
if text.startswith("- "):
text = "<li>" + text[2:]
text = text.replace("\n- ", '<br><li> ').replace("\n - ", '<br><li> ')
# convert new lines to <br>
text = text.replace("\n", '<br>')
# split text into lines
lines = text.split('<br>')
words = [] words = []
for i,line in enumerate(lines): for i, line in enumerate(lines):
words += line.split(' ') words += line.split(' ')
if i<len(lines)-1: if i < len(lines) - 1:
words[-1] += "\n" words[-1] += "<br>"
def count_chars_without_html(string):
if '<' not in string:
return len(string)
no_html_string = re.sub('<[^>]+>', '', string)
return len(no_html_string)
# words = text.split(' ') new_text = []
new_text = ""
current_length = 0
is_inside_code = False is_inside_code = False
current_length = 0
for word in words: for word in words:
# Check if adding this word exceeds x characters is_saved_word = False
if current_length + len(word) > x: if word == "<code>" or word == "</code>" or word == "<li>" or word == "<br>":
if not is_inside_code: is_saved_word = True
new_text += f"{new_line_char} " # Insert line break if "<code>" in word:
current_length = 0 # Reset counter
else:
new_text += f"`{new_line_char} `"
# check if inside <code> tag
if word.startswith("`") and not is_inside_code and not word.endswith("`"):
is_inside_code = True is_inside_code = True
if word.endswith("`"): if "</code>" in word:
is_inside_code = False is_inside_code = False
# Add the word to the new text len_word = count_chars_without_html(word)
if word.endswith("\n"): if not is_saved_word and (current_length + len_word > x):
new_text += word if is_inside_code:
new_text.append("</code><br><code>")
else: else:
new_text += word + " " new_text.append("<br>")
current_length += len(word) + 1 # Add 1 for the space current_length = 0 # Reset counter
new_text.append(word + " ")
if not is_saved_word:
current_length += len_word + 1 # Add 1 for the space
if word.endswith("\n"): if word == "<li>" or word == "<br>":
current_length = 0 current_length = 0
return new_text.strip() # Remove trailing space
return ''.join(new_text).strip()
def replace_code_tags(text):
"""
Replace odd instances of ` with <code> and even instances of ` with </code>
"""
parts = text.split('`')
for i in range(1, len(parts), 2):
parts[i] = '<code>' + parts[i] + '</code>'
return ''.join(parts)

View File

@ -3,7 +3,7 @@ from unittest.mock import patch
from pr_agent.git_providers.codecommit_provider import CodeCommitFile from pr_agent.git_providers.codecommit_provider import CodeCommitFile
from pr_agent.git_providers.codecommit_provider import CodeCommitProvider from pr_agent.git_providers.codecommit_provider import CodeCommitProvider
from pr_agent.git_providers.codecommit_provider import PullRequestCCMimic from pr_agent.git_providers.codecommit_provider import PullRequestCCMimic
from pr_agent.git_providers.git_provider import EDIT_TYPE from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
class TestCodeCommitFile: class TestCodeCommitFile:

View File

@ -1,5 +1,6 @@
# Generated by CodiumAI # Generated by CodiumAI
from pr_agent.algo.utils import convert_to_markdown from pr_agent.algo.utils import convert_to_markdown
from pr_agent.tools.pr_description import insert_br_after_x_chars
""" """
Code Analysis Code Analysis
@ -93,3 +94,27 @@ class TestConvertToMarkdown:
} }
expected_output = '' expected_output = ''
assert convert_to_markdown(input_data).strip() == expected_output.strip() assert convert_to_markdown(input_data).strip() == expected_output.strip()
class TestBR:
def test_br1(self):
file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.'
file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from '
'<code>pr_agent.algo.types</code> instead <br>of '
'<code>pr_agent.git_providers.git_provider</code>.')
assert file_change_description_br == expected_output
# print("-----")
# print(file_change_description_br)
def test_br2(self):
file_change_description = ('- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`')
file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>'
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection <br>'
'ColorPaletteResourcesCollection</code>')
assert file_change_description_br == expected_output
# print("-----")
# print(file_change_description_br)

View File

@ -1,8 +1,7 @@
# Generated by CodiumAI # Generated by CodiumAI
from pr_agent.git_providers.git_provider import FilePatchInfo from pr_agent.algo.types import FilePatchInfo
from pr_agent.algo.pr_processing import find_line_number_of_relevant_line_in_file from pr_agent.algo.utils import find_line_number_of_relevant_line_in_file
import pytest import pytest