From 0ce42e786ef573946dff0f633794253f0e03b2e3 Mon Sep 17 00:00:00 2001 From: "Hussam.lawen" Date: Thu, 6 Jul 2023 11:12:41 +0300 Subject: [PATCH 1/5] Combine all modified file that been compressed into one list at the end of the PR --- pr_agent/algo/pr_processing.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 68120733..32811feb 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -32,8 +32,12 @@ def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler return "\n".join(patches_extended) # if we are over the limit, start pruning - patches_compressed = pr_generate_compressed_diff(pr_languages, token_handler) - return "\n".join(patches_compressed) + patches_compressed, modified_file_names = pr_generate_compressed_diff(pr_languages, token_handler) + final_diff = "\n".join(patches_compressed) + if modified_file_names: + modified_list_str = "Modified files:\n" + "\n".join(modified_file_names) + final_diff = final_diff + "\n\n" + modified_list_str + return final_diff def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \ @@ -67,7 +71,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) - return patches_extended, total_tokens -def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> list: +def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list): # Apply Diff Minimization techniques to reduce the number of tokens: # 0. Start from the largest diff patch to smaller ones # 1. Don't use extend context lines around diff @@ -76,7 +80,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # 4. 
Minimize all remaining files when you reach token limit patches = [] - + modified_files = [] # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: @@ -105,14 +109,16 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # until we meet the requirements if settings.config.verbosity_level >= 2: logging.warning(f"Patch too large, minimizing it, {file.filename}") - patch = "File was modified" + patch = None + modified_files.append(file.filename) if patch: patch_final = f"## {file.filename}\n\n{patch}\n" patches.append(patch_final) total_tokens += token_handler.count_tokens(patch_final) if settings.config.verbosity_level >= 2: logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") - return patches + + return patches, modified_files def load_large_diff(file, new_file_content_str: str, original_file_content_str: str, patch: str) -> str: From 1a626fb1f31e973e1e73cb3bdca99ea24ec0be1a Mon Sep 17 00:00:00 2001 From: "Hussam.lawen" Date: Thu, 6 Jul 2023 11:23:38 +0300 Subject: [PATCH 2/5] change "modified files" to "more modified files" --- pr_agent/algo/pr_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 32811feb..686a7130 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -35,7 +35,7 @@ def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler patches_compressed, modified_file_names = pr_generate_compressed_diff(pr_languages, token_handler) final_diff = "\n".join(patches_compressed) if modified_file_names: - modified_list_str = "Modified files:\n" + "\n".join(modified_file_names) + modified_list_str = "More modified files:\n" + "\n".join(modified_file_names) final_diff = final_diff + "\n\n" + modified_list_str return final_diff From 795f6ab8d57eb0aefe54dfaef95071bfaeb5e15a Mon Sep 17 00:00:00 2001 From: 
"Hussam.lawen" Date: Thu, 6 Jul 2023 12:21:27 +0300 Subject: [PATCH 3/5] Add deleted files section and count their tokens --- pr_agent/algo/git_patch_processing.py | 2 +- pr_agent/algo/pr_processing.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index b9d1fe1a..d75c6c97 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -96,7 +96,7 @@ def handle_patch_deletions(patch: str, original_file_content_str: str, # logic for handling deleted files - don't show patch, just show that the file was deleted if settings.config.verbosity_level > 0: logging.info(f"Processing file: {file_name}, minimizing deletion file") - patch = "File was deleted\n" + patch = None # file was deleted else: patch_lines = patch.splitlines() patch_new = omit_deletion_hunks(patch_lines) diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 686a7130..8def97fa 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -10,6 +10,10 @@ from pr_agent.algo.token_handler import TokenHandler from pr_agent.config_loader import settings from pr_agent.git_providers import GithubProvider +DELETED_FILES_ = "Deleted files:\n" + +MORE_MODIFIED_FILES_ = "More modified files:\n" + OUTPUT_BUFFER_TOKENS = 800 PATCH_EXTRA_LINES = 3 @@ -32,11 +36,14 @@ def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler return "\n".join(patches_extended) # if we are over the limit, start pruning - patches_compressed, modified_file_names = pr_generate_compressed_diff(pr_languages, token_handler) + patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages, token_handler) final_diff = "\n".join(patches_compressed) if modified_file_names: - modified_list_str = "More modified files:\n" + "\n".join(modified_file_names) + modified_list_str = 
MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) final_diff = final_diff + "\n\n" + modified_list_str + if deleted_file_names: + deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names) + final_diff = final_diff + "\n\n" + deleted_list_str return final_diff @@ -71,7 +78,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) - return patches_extended, total_tokens -def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list): +def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list, list): # Apply Diff Minimization techniques to reduce the number of tokens: # 0. Start from the largest diff patch to smaller ones # 1. Don't use extend context lines around diff @@ -81,6 +88,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> patches = [] modified_files = [] + deleted_files = [] # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: @@ -98,6 +106,12 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> # removing delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) + if patch is None: + if not deleted_files: + total_tokens += token_handler.count_tokens(DELETED_FILES_) + deleted_files.append(file.filename) + total_tokens += token_handler.count_tokens(file.filename) + 1 + continue new_patch_tokens = token_handler.count_tokens(patch) if total_tokens > token_handler.limit - OUTPUT_BUFFER_TOKENS // 2: @@ -110,7 +124,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> if settings.config.verbosity_level >= 2: logging.warning(f"Patch too large, minimizing it, {file.filename}") patch = None + if not modified_files: + total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_) modified_files.append(file.filename) + total_tokens += 
token_handler.count_tokens(file.filename) + 1 if patch: patch_final = f"## {file.filename}\n\n{patch}\n" patches.append(patch_final) @@ -118,7 +135,7 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> if settings.config.verbosity_level >= 2: logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") - return patches, modified_files + return patches, modified_files, deleted_files def load_large_diff(file, new_file_content_str: str, original_file_content_str: str, patch: str) -> str: From 542c4599ba9dc12b78211464f1faf0432fd9ef2b Mon Sep 17 00:00:00 2001 From: "Hussam.lawen" Date: Thu, 6 Jul 2023 12:36:25 +0300 Subject: [PATCH 4/5] fix tests --- tests/unit/test_handle_patch_deletions.py | 2 +- tests/unit/{test_language_handler => test_language_handler.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/unit/{test_language_handler => test_language_handler.py} (100%) diff --git a/tests/unit/test_handle_patch_deletions.py b/tests/unit/test_handle_patch_deletions.py index afb30dd6..95ab8674 100644 --- a/tests/unit/test_handle_patch_deletions.py +++ b/tests/unit/test_handle_patch_deletions.py @@ -62,7 +62,7 @@ class TestHandlePatchDeletions: new_file_content_str = '' file_name = 'file.py' assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str, - file_name) == 'File was deleted\n' + file_name) is None # Tests that handle_patch_deletions returns the original patch when patch and patch_new are equal def test_handle_patch_deletions_edge_case_patch_and_patch_new_are_equal(self): diff --git a/tests/unit/test_language_handler b/tests/unit/test_language_handler.py similarity index 100% rename from tests/unit/test_language_handler rename to tests/unit/test_language_handler.py From ff720d32fe0a6bc1422de27162f9b5ca5d90cd97 Mon Sep 17 00:00:00 2001 From: "Hussam.lawen" Date: Thu, 6 Jul 2023 13:20:08 +0300 Subject: [PATCH 5/5] pylance --- pr_agent/algo/pr_processing.py | 6 +++--- 1 file changed, 
3 insertions(+), 3 deletions(-)

diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index 8def97fa..3ad45bf2 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import difflib
 import logging
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Union
 
 from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions
 from pr_agent.algo.language_handler import sort_files_by_main_languages
@@ -18,7 +18,7 @@ OUTPUT_BUFFER_TOKENS = 800
 PATCH_EXTRA_LINES = 3
 
 
-def get_pr_diff(git_provider: [GithubProvider, Any], token_handler: TokenHandler) -> str:
+def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str:
     """
     Returns a string with the diff of the PR.
     If needed, apply diff minimization techniques to reduce the number of tokens
@@ -78,7 +78,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
     return patches_extended, total_tokens
 
 
-def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> (list, list, list):
+def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]:
     # Apply Diff Minimization techniques to reduce the number of tokens:
     # 0. Start from the largest diff patch to smaller ones
     # 1. Don't use extend context lines around diff