Merge pull request #439 from Codium-ai/tr/fixes_added_files

Enhancement of Patch Handling and PR Processing
This commit is contained in:
mrT23
2023-11-08 04:48:07 -08:00
committed by GitHub
8 changed files with 92 additions and 42 deletions

View File

@ -23,7 +23,7 @@ def filter_ignored(files):
# keep filenames that _don't_ match the ignore regex # keep filenames that _don't_ match the ignore regex
for r in compiled_patterns: for r in compiled_patterns:
files = [f for f in files if not r.match(f.filename)] files = [f for f in files if (f.filename and not r.match(f.filename))]
except Exception as e: except Exception as e:
print(f"Could not filter file list: {e}") print(f"Could not filter file list: {e}")

View File

@ -3,6 +3,7 @@ from __future__ import annotations
import re import re
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import EDIT_TYPE
from pr_agent.log import get_logger from pr_agent.log import get_logger
@ -115,7 +116,7 @@ def omit_deletion_hunks(patch_lines) -> str:
def handle_patch_deletions(patch: str, original_file_content_str: str, def handle_patch_deletions(patch: str, original_file_content_str: str,
new_file_content_str: str, file_name: str) -> str: new_file_content_str: str, file_name: str, edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN) -> str:
""" """
Handle entire file or deletion patches. Handle entire file or deletion patches.
@ -132,7 +133,7 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
str: The modified patch with deletion hunks omitted. str: The modified patch with deletion hunks omitted.
""" """
if not new_file_content_str: if not new_file_content_str and edit_type != EDIT_TYPE.ADDED:
# logic for handling deleted files - don't show patch, just show that the file was deleted # logic for handling deleted files - don't show patch, just show that the file was deleted
if get_settings().config.verbosity_level > 0: if get_settings().config.verbosity_level > 0:
get_logger().info(f"Processing file: {file_name}, minimizing deletion file") get_logger().info(f"Processing file: {file_name}, minimizing deletion file")

View File

@ -13,12 +13,14 @@ from pr_agent.algo.file_filter import filter_ignored
from pr_agent.algo.token_handler import TokenHandler, get_token_encoder from pr_agent.algo.token_handler import TokenHandler, get_token_encoder
from pr_agent.algo.utils import get_max_tokens from pr_agent.algo.utils import get_max_tokens
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
from pr_agent.log import get_logger from pr_agent.log import get_logger
DELETED_FILES_ = "Deleted files:\n" DELETED_FILES_ = "Deleted files:\n"
MORE_MODIFIED_FILES_ = "More modified files:\n" MORE_MODIFIED_FILES_ = "Additional modified files (insufficient token budget to process):\n"
ADDED_FILES_ = "Additional added files (insufficient token budget to process):\n"
OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000 OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000
OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600 OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
@ -68,10 +70,13 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
return "\n".join(patches_extended) return "\n".join(patches_extended)
# if we are over the limit, start pruning # if we are over the limit, start pruning
patches_compressed, modified_file_names, deleted_file_names = \ patches_compressed, modified_file_names, deleted_file_names, added_file_names = \
pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks) pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
final_diff = "\n".join(patches_compressed) final_diff = "\n".join(patches_compressed)
if added_file_names:
added_list_str = ADDED_FILES_ + "\n".join(added_file_names)
final_diff = final_diff + "\n\n" + added_list_str
if modified_file_names: if modified_file_names:
modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
final_diff = final_diff + "\n\n" + modified_list_str final_diff = final_diff + "\n\n" + modified_list_str
@ -122,7 +127,7 @@ def pr_generate_extended_diff(pr_languages: list,
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str, def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str,
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]: convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list]:
""" """
Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number of Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number of
tokens used. tokens used.
@ -148,6 +153,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
""" """
patches = [] patches = []
added_files_list = []
modified_files_list = [] modified_files_list = []
deleted_files_list = [] deleted_files_list = []
# sort each one of the languages in top_langs by the number of tokens in the diff # sort each one of the languages in top_langs by the number of tokens in the diff
@ -165,7 +171,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
# removing delete-only hunks # removing delete-only hunks
patch = handle_patch_deletions(patch, original_file_content_str, patch = handle_patch_deletions(patch, original_file_content_str,
new_file_content_str, file.filename) new_file_content_str, file.filename, file.edit_type)
if patch is None: if patch is None:
if not deleted_files_list: if not deleted_files_list:
total_tokens += token_handler.count_tokens(DELETED_FILES_) total_tokens += token_handler.count_tokens(DELETED_FILES_)
@ -190,6 +196,11 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
# until we meet the requirements # until we meet the requirements
if get_settings().config.verbosity_level >= 2: if get_settings().config.verbosity_level >= 2:
get_logger().warning(f"Patch too large, minimizing it, {file.filename}") get_logger().warning(f"Patch too large, minimizing it, {file.filename}")
if file.edit_type == EDIT_TYPE.ADDED:
if not added_files_list:
total_tokens += token_handler.count_tokens(ADDED_FILES_)
added_files_list.append(file.filename)
else:
if not modified_files_list: if not modified_files_list:
total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_) total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_)
modified_files_list.append(file.filename) modified_files_list.append(file.filename)
@ -206,7 +217,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
if get_settings().config.verbosity_level >= 2: if get_settings().config.verbosity_level >= 2:
get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}") get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}")
return patches, modified_files_list, deleted_files_list return patches, modified_files_list, deleted_files_list, added_files_list
async def retry_with_fallback_models(f: Callable): async def retry_with_fallback_models(f: Callable):
@ -397,7 +408,7 @@ def get_pr_multi_diffs(git_provider: GitProvider,
continue continue
# Remove delete-only hunks # Remove delete-only hunks
patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename, file.edit_type)
if patch is None: if patch is None:
continue continue

View File

@ -328,13 +328,16 @@ def set_custom_labels(variables):
variables["custom_labels_examples"] = f" - {list(labels.keys())[0]}" variables["custom_labels_examples"] = f" - {list(labels.keys())[0]}"
def get_user_labels(current_labels): def get_user_labels(current_labels: List[str] = None):
## Only keep labels that has been added by the user """
Only keep labels that has been added by the user
"""
try:
if current_labels is None: if current_labels is None:
current_labels = [] current_labels = []
user_labels = [] user_labels = []
for label in current_labels: for label in current_labels:
if label in ['Bug fix', 'Tests', 'Refactoring', 'Enhancement', 'Documentation', 'Other']: if label.lower() in ['bug fix', 'tests', 'refactoring', 'enhancement', 'documentation', 'other']:
continue continue
if get_settings().config.enable_custom_labels: if get_settings().config.enable_custom_labels:
if label in get_settings().custom_labels: if label in get_settings().custom_labels:
@ -342,6 +345,9 @@ def get_user_labels(current_labels):
user_labels.append(label) user_labels.append(label)
if user_labels: if user_labels:
get_logger().info(f"Keeping user labels: {user_labels}") get_logger().info(f"Keeping user labels: {user_labels}")
except Exception as e:
get_logger().exception(f"Failed to get user labels: {e}")
return current_labels
return user_labels return user_labels

View File

@ -9,7 +9,7 @@ from starlette_context import context
from ..algo.pr_processing import find_line_number_of_relevant_line_in_file from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger
from .git_provider import FilePatchInfo, GitProvider from .git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
class BitbucketProvider(GitProvider): class BitbucketProvider(GitProvider):
@ -132,14 +132,24 @@ class BitbucketProvider(GitProvider):
diff.old.get_data("links") diff.old.get_data("links")
) )
new_file_content_str = self._get_pr_file_content(diff.new.get_data("links")) new_file_content_str = self._get_pr_file_content(diff.new.get_data("links"))
diff_files.append( file_patch_canonic_structure = FilePatchInfo(
FilePatchInfo(
original_file_content_str, original_file_content_str,
new_file_content_str, new_file_content_str,
diff_split[index], diff_split[index],
diff.new.path, diff.new.path,
) )
)
if diff.data['status'] == 'added':
file_patch_canonic_structure.edit_type = EDIT_TYPE.ADDED
elif diff.data['status'] == 'removed':
file_patch_canonic_structure.edit_type = EDIT_TYPE.DELETED
elif diff.data['status'] == 'modified':
file_patch_canonic_structure.edit_type = EDIT_TYPE.MODIFIED
elif diff.data['status'] == 'renamed':
file_patch_canonic_structure.edit_type = EDIT_TYPE.RENAMED
diff_files.append(file_patch_canonic_structure)
self.diff_files = diff_files self.diff_files = diff_files
return diff_files return diff_files
@ -288,6 +298,11 @@ class BitbucketProvider(GitProvider):
}) })
response = requests.request("PUT", self.bitbucket_pull_request_api_url, headers=self.headers, data=payload) response = requests.request("PUT", self.bitbucket_pull_request_api_url, headers=self.headers, data=payload)
try:
if response.status_code != 200:
get_logger().info(f"Failed to update description, error code: {response.status_code}")
except:
pass
return response return response
# bitbucket does not support labels # bitbucket does not support labels

View File

@ -13,6 +13,7 @@ class EDIT_TYPE(Enum):
DELETED = 2 DELETED = 2
MODIFIED = 3 MODIFIED = 3
RENAMED = 4 RENAMED = 4
UNKNOWN = 5
@dataclass @dataclass
@ -22,7 +23,7 @@ class FilePatchInfo:
patch: str patch: str
filename: str filename: str
tokens: int = -1 tokens: int = -1
edit_type: EDIT_TYPE = EDIT_TYPE.MODIFIED edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN
old_filename: str = None old_filename: str = None
@ -94,10 +95,10 @@ class GitProvider(ABC):
def get_pr_description(self, *, full: bool = True) -> str: def get_pr_description(self, *, full: bool = True) -> str:
from pr_agent.config_loader import get_settings from pr_agent.config_loader import get_settings
from pr_agent.algo.pr_processing import clip_tokens from pr_agent.algo.pr_processing import clip_tokens
max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) max_tokens_description = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None)
description = self.get_pr_description_full() if full else self.get_user_description() description = self.get_pr_description_full() if full else self.get_user_description()
if max_tokens: if max_tokens_description:
return clip_tokens(description, max_tokens) return clip_tokens(description, max_tokens_description)
return description return description
def get_user_description(self) -> str: def get_user_description(self) -> str:
@ -153,6 +154,8 @@ def get_main_pr_language(languages, files) -> str:
# validate that the specific commit uses the main language # validate that the specific commit uses the main language
extension_list = [] extension_list = []
for file in files: for file in files:
if not file:
continue
if isinstance(file, str): if isinstance(file, str):
file = FilePatchInfo(base_file=None, head_file=None, patch=None, filename=file) file = FilePatchInfo(base_file=None, head_file=None, patch=None, filename=file)
extension_list.append(file.filename.rsplit('.')[-1]) extension_list.append(file.filename.rsplit('.')[-1])

View File

@ -13,7 +13,7 @@ from ..algo.utils import load_large_diff
from ..config_loader import get_settings from ..config_loader import get_settings
from ..log import get_logger from ..log import get_logger
from ..servers.utils import RateLimitExceeded from ..servers.utils import RateLimitExceeded
from .git_provider import FilePatchInfo, GitProvider, IncrementalPR from .git_provider import FilePatchInfo, GitProvider, IncrementalPR, EDIT_TYPE
class GithubProvider(GitProvider): class GithubProvider(GitProvider):
@ -129,7 +129,20 @@ class GithubProvider(GitProvider):
if not patch: if not patch:
patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str) patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
diff_files.append(FilePatchInfo(original_file_content_str, new_file_content_str, patch, file.filename)) if file.status == 'added':
edit_type = EDIT_TYPE.ADDED
elif file.status == 'removed':
edit_type = EDIT_TYPE.DELETED
elif file.status == 'renamed':
edit_type = EDIT_TYPE.RENAMED
elif file.status == 'modified':
edit_type = EDIT_TYPE.MODIFIED
else:
get_logger().error(f"Unknown edit type: {file.status}")
edit_type = EDIT_TYPE.UNKNOWN
file_patch_canonical_structure = FilePatchInfo(original_file_content_str, new_file_content_str, patch,
file.filename, edit_type=edit_type)
diff_files.append(file_patch_canonical_structure)
self.diff_files = diff_files self.diff_files = diff_files
return diff_files return diff_files

View File

@ -152,7 +152,8 @@ PR Analysis:
Focused PR: no, because ... Focused PR: no, because ...
{%- endif %} {%- endif %}
{%- if require_estimate_effort_to_review %} {%- if require_estimate_effort_to_review %}
Estimated effort to review [1-5]: 3, because ... Estimated effort to review [1-5]: |-
3, because ...
{%- endif %} {%- endif %}
PR Feedback: PR Feedback:
General PR suggestions: |- General PR suggestions: |-