From 795f6ab8d57eb0aefe54dfaef95071bfaeb5e15a Mon Sep 17 00:00:00 2001 From: "Hussam.lawen" Date: Thu, 6 Jul 2023 12:21:27 +0300 Subject: [PATCH] Add deleted files section and count their tokens --- pr_agent/algo/git_patch_processing.py | 2 +- pr_agent/algo/pr_processing.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index b9d1fe1a..d75c6c97 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -96,7 +96,7 @@ def handle_patch_deletions(patch: str, original_file_content_str: str, # logic for handling deleted files - don't show patch, just show that the file was deleted if settings.config.verbosity_level > 0: logging.info(f"Processing file: {file_name}, minimizing deletion file") - patch = "File was deleted\n" + patch = None # file was deleted else: patch_lines = patch.splitlines() patch_new = omit_deletion_hunks(patch_lines) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 686a7130..8def97fa 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -10,6 +10,10 @@ from pr_agent.algo.token_handler import TokenHandler from pr_agent.config_loader import settings from pr_agent.git_providers import GithubProvider +DELETED_FILES_ = "Deleted files:\n" + +MORE_MODIFIED_FILES_ = "More modified files:\n" + OUTPUT_BUFFER_TOKENS = 800 PATCH_EXTRA_LINES = 3 @@ -32,11 +36,14 @@ def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler return "\n".join(patches_extended) # if we are over the limit, start pruning - patches_compressed, modified_file_names = pr_generate_compressed_diff(pr_languages, token_handler) + patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages, token_handler) final_diff = "\n".join(patches_compressed) if modified_file_names: - modified_list_str = "More modified files:\n" + "\n".join(modified_file_names) + modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) final_diff = final_diff + "\n\n" + modified_list_str + if deleted_file_names: + deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names) + final_diff = final_diff + "\n\n" + deleted_list_str return final_diff @@ -71,7 +78,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) - return patches_extended, total_tokens -def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list): +def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list, list): # Apply Diff Minimization techniques to reduce the number of tokens: # 0. Start from the largest diff patch to smaller ones # 1. Don't use extend context lines around diff @@ -81,6 +88,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> patches = [] modified_files = [] + deleted_files = [] # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: @@ -98,6 +106,12 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # removing delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) + if patch is None: + if not deleted_files: + total_tokens += token_handler.count_tokens(DELETED_FILES_) + deleted_files.append(file.filename) + total_tokens += token_handler.count_tokens(file.filename) + 1 + continue new_patch_tokens = token_handler.count_tokens(patch) if total_tokens > token_handler.limit - OUTPUT_BUFFER_TOKENS // 2: @@ -110,7 +124,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> if settings.config.verbosity_level >= 2: logging.warning(f"Patch too large, minimizing it, {file.filename}") patch = None + if not modified_files: + total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_) modified_files.append(file.filename) + total_tokens += token_handler.count_tokens(file.filename) + 1 if patch: patch_final = f"## {file.filename}\n\n{patch}\n" patches.append(patch_final) @@ -118,7 +135,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> if settings.config.verbosity_level >= 2: logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") - return patches, modified_files + return patches, modified_files, deleted_files def load_large_diff(file, new_file_content_str: str, original_file_content_str: str, patch: str) -> str: