diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index b9d1fe1a..d75c6c97 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -96,7 +96,7 @@ def handle_patch_deletions(patch: str, original_file_content_str: str, # logic for handling deleted files - don't show patch, just show that the file was deleted if settings.config.verbosity_level > 0: logging.info(f"Processing file: {file_name}, minimizing deletion file") - patch = "File was deleted\n" + patch = None # file was deleted else: patch_lines = patch.splitlines() patch_new = omit_deletion_hunks(patch_lines) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 68120733..3ad45bf2 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -2,7 +2,7 @@ from __future__ import annotations import difflib import logging -from typing import Any, Dict, Tuple +from typing import Any, Dict, Tuple, Union from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions from pr_agent.algo.language_handler import sort_files_by_main_languages @@ -10,11 +10,15 @@ from pr_agent.algo.token_handler import TokenHandler from pr_agent.config_loader import settings from pr_agent.git_providers import GithubProvider +DELETED_FILES_ = "Deleted files:\n" + +MORE_MODIFIED_FILES_ = "More modified files:\n" + OUTPUT_BUFFER_TOKENS = 800 PATCH_EXTRA_LINES = 3 -def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler) -> str: +def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str: """ Returns a string with the diff of the PR. 
If needed, apply diff minimization techniques to reduce the number of tokens @@ -32,8 +36,15 @@ def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler return "\n".join(patches_extended) # if we are over the limit, start pruning - patches_compressed = pr_generate_compressed_diff(pr_languages, token_handler) - return "\n".join(patches_compressed) + patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages, token_handler) + final_diff = "\n".join(patches_compressed) + if modified_file_names: + modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) + final_diff = final_diff + "\n\n" + modified_list_str + if deleted_file_names: + deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names) + final_diff = final_diff + "\n\n" + deleted_list_str + return final_diff def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \ @@ -67,7 +78,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) - return patches_extended, total_tokens -def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> list: +def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]: # Apply Diff Minimization techniques to reduce the number of tokens: # 0. Start from the largest diff patch to smaller ones # 1. Don't use extend context lines around diff @@ -76,7 +87,8 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # 4. 
Minimize all remaining files when you reach token limit patches = [] - + modified_files = [] + deleted_files = [] # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: @@ -94,6 +106,12 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # removing delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) + if patch is None: + if not deleted_files: + total_tokens += token_handler.count_tokens(DELETED_FILES_) + deleted_files.append(file.filename) + total_tokens += token_handler.count_tokens(file.filename) + 1 + continue new_patch_tokens = token_handler.count_tokens(patch) if total_tokens > token_handler.limit - OUTPUT_BUFFER_TOKENS // 2: @@ -105,14 +123,19 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # until we meet the requirements if settings.config.verbosity_level >= 2: logging.warning(f"Patch too large, minimizing it, {file.filename}") - patch = "File was modified" + patch = None + if not modified_files: + total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_) + modified_files.append(file.filename) + total_tokens += token_handler.count_tokens(file.filename) + 1 if patch: patch_final = f"## {file.filename}\n\n{patch}\n" patches.append(patch_final) total_tokens += token_handler.count_tokens(patch_final) if settings.config.verbosity_level >= 2: logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") - return patches + + return patches, modified_files, deleted_files def load_large_diff(file, new_file_content_str: str, original_file_content_str: str, patch: str) -> str: diff --git a/tests/unit/test_handle_patch_deletions.py b/tests/unit/test_handle_patch_deletions.py index afb30dd6..95ab8674 100644 --- a/tests/unit/test_handle_patch_deletions.py +++ b/tests/unit/test_handle_patch_deletions.py @@ -62,7 +62,7 @@ class TestHandlePatchDeletions: 
new_file_content_str = '' file_name = 'file.py' assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str, - file_name) == 'File was deleted\n' + file_name) is None # Tests that handle_patch_deletions returns the original patch when patch and patch_new are equal def test_handle_patch_deletions_edge_case_patch_and_patch_new_are_equal(self): diff --git a/tests/unit/test_language_handler b/tests/unit/test_language_handler.py similarity index 100% rename from tests/unit/test_language_handler rename to tests/unit/test_language_handler.py