Merge pull request #1004 from Codium-ai/tr/large_pr

Tr/large pr
This commit is contained in:
Tal
2024-06-27 08:37:02 +03:00
committed by GitHub
6 changed files with 385 additions and 156 deletions

View File

@ -87,6 +87,10 @@ publish_labels = ...
<td><b>collapsible_file_list</b></td>
<td>If set to true, the file list in the "Changes walkthrough" section will be collapsible. If set to "adaptive", the file list will be collapsible only if there are more than 8 files. Default is "adaptive".</td>
</tr>
<tr>
<td><b>enable_large_pr_handling</b></td>
<td>Pro feature. If set to true, then for a large PR the tool will make several calls to the AI and combine their responses so that more files can be covered. Default is true.</td>
</tr>
<tr>
<td><b>enable_help_text</b></td>
<td>If set to true, the tool will display help text in the comment. Default is false.

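The settings in this table live under the [pr_description] section of configuration.toml. A minimal, illustrative sketch of how they might be set (values are examples, not necessarily your defaults; max_ai_calls and mention_extra_files are the related large-PR settings shown later in this diff):

```toml
[pr_description]
collapsible_file_list = "adaptive"   # true, false, or "adaptive"
enable_large_pr_handling = true      # Pro feature: split a large PR across several AI calls
max_ai_calls = 3                     # upper bound on AI calls when large-PR handling is active
mention_extra_files = true           # list files that did not fit into the token budget
enable_help_text = false
```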
View File

@ -24,26 +24,10 @@ ADDED_FILES_ = "Additional added files (insufficient token budget to process):\n
OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000
OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str,
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str:
"""
Returns a string with the diff of the pull request, applying diff minimization techniques if needed.
Args:
git_provider (GitProvider): An object of the GitProvider class representing the Git provider used for the pull
request.
token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the
pull request.
model (str): The name of the model used for tokenization.
add_line_numbers_to_hunks (bool, optional): A boolean indicating whether to add line numbers to the hunks in the
diff. Defaults to False.
disable_extra_lines (bool, optional): A boolean indicating whether to disable the extension of each patch with
extra lines of context. Defaults to False.
Returns:
str: A string with the diff of the pull request, applying diff minimization techniques if needed.
"""
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False, large_pr_handling=False) -> str:
if disable_extra_lines:
PATCH_EXTRA_LINES = 0
else:
@ -87,39 +71,99 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
# if we are over the limit, start pruning
get_logger().info(f"Tokens: {total_tokens}, total tokens over limit: {get_max_tokens(model)}, "
f"pruning diff.")
patches_compressed, modified_file_names, deleted_file_names, added_file_names, total_tokens_new = \
patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \
pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
if large_pr_handling and len(patches_compressed_list) > 1:
get_logger().info(f"Large PR handling mode, and found {len(patches_compressed_list)} patches with original diff.")
return "" # return empty string, as we generate multiple patches with a different prompt
# return the first patch
patches_compressed = patches_compressed_list[0]
total_tokens_new = total_tokens_list[0]
files_in_patch = files_in_patches_list[0]
# Insert additional information about added, modified, and deleted files if there is enough space
max_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD
curr_token = total_tokens_new # == token_handler.count_tokens(final_diff)+token_handler.prompt_tokens
final_diff = "\n".join(patches_compressed)
delta_tokens = 10
if added_file_names and (max_tokens - curr_token) > delta_tokens:
added_list_str = ADDED_FILES_ + "\n".join(added_file_names)
added_list_str = clip_tokens(added_list_str, max_tokens - curr_token)
if added_list_str:
final_diff = final_diff + "\n\n" + added_list_str
curr_token += token_handler.count_tokens(added_list_str) + 2
if modified_file_names and (max_tokens - curr_token) > delta_tokens:
modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token)
if modified_list_str:
final_diff = final_diff + "\n\n" + modified_list_str
curr_token += token_handler.count_tokens(modified_list_str) + 2
if deleted_file_names and (max_tokens - curr_token) > delta_tokens:
deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names)
deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token)
if deleted_list_str:
final_diff = final_diff + "\n\n" + deleted_list_str
try:
get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, "
f"deleted_list_str: {deleted_list_str}")
except Exception as e:
pass
added_list_str = modified_list_str = deleted_list_str = ""
unprocessed_files = []
# generate the added, modified, and deleted files lists
if (max_tokens - curr_token) > delta_tokens:
for filename, file_values in file_dict.items():
if filename in files_in_patch:
continue
if file_values['edit_type'] == EDIT_TYPE.ADDED:
unprocessed_files.append(filename)
if not added_list_str:
added_list_str = ADDED_FILES_ + f"\n{filename}"
else:
added_list_str = added_list_str + f"\n{filename}"
elif file_values['edit_type'] in (EDIT_TYPE.MODIFIED, EDIT_TYPE.RENAMED):
unprocessed_files.append(filename)
if not modified_list_str:
modified_list_str = MORE_MODIFIED_FILES_ + f"\n{filename}"
else:
modified_list_str = modified_list_str + f"\n{filename}"
elif file_values['edit_type'] == EDIT_TYPE.DELETED:
# unprocessed_files.append(filename) # not needed here, because the file was deleted, so no need to process it
if not deleted_list_str:
deleted_list_str = DELETED_FILES_ + f"\n{filename}"
else:
deleted_list_str = deleted_list_str + f"\n{filename}"
# prune the added, modified, and deleted files lists, and add them to the final diff
added_list_str = clip_tokens(added_list_str, max_tokens - curr_token)
if added_list_str:
final_diff = final_diff + "\n\n" + added_list_str
curr_token += token_handler.count_tokens(added_list_str) + 2
modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token)
if modified_list_str:
final_diff = final_diff + "\n\n" + modified_list_str
curr_token += token_handler.count_tokens(modified_list_str) + 2
deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token)
if deleted_list_str:
final_diff = final_diff + "\n\n" + deleted_list_str
get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, "
f"deleted_list_str: {deleted_list_str}")
return final_diff
def get_pr_diff_multiple_patchs(git_provider: GitProvider, token_handler: TokenHandler, model: str,
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False):
try:
diff_files_original = git_provider.get_diff_files()
except RateLimitExceededException as e:
get_logger().error(f"Rate limit exceeded for git provider API. original message {e}")
raise
diff_files = filter_ignored(diff_files_original)
if diff_files != diff_files_original:
try:
get_logger().info(f"Filtered out {len(diff_files_original) - len(diff_files)} files")
new_names = set([a.filename for a in diff_files])
orig_names = set([a.filename for a in diff_files_original])
get_logger().info(f"Filtered out files: {orig_names - new_names}")
except Exception as e:
pass
# get pr languages
pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
if pr_languages:
try:
get_logger().info(f"PR main language: {pr_languages[0]['language']}")
except Exception as e:
pass
patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \
pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
return patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
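Taken together, the two entry points above suggest a calling pattern roughly like the following. This is an illustrative sketch rather than code from this PR; the fallback flow follows the comments in get_pr_diff, and the name per_call_diffs is an assumption:

```python
# Ask for the single-prompt diff first, allowing large-PR handling.
diff = get_pr_diff(git_provider, token_handler, model, large_pr_handling=True)

if not diff:
    # Large PR: get_pr_diff returned "" because several patches are needed,
    # so fetch the per-iteration compressed patches instead.
    (patches_compressed_list, total_tokens_list, deleted_files_list,
     remaining_files_list, file_dict, files_in_patches_list) = \
        get_pr_diff_multiple_patchs(git_provider, token_handler, model)
    per_call_diffs = ["\n".join(patches) for patches in patches_compressed_list]
else:
    per_call_diffs = [diff]

# Each entry in per_call_diffs is then sent in its own prompt / AI call.
```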
def pr_generate_extended_diff(pr_languages: list,
token_handler: TokenHandler,
add_line_numbers_to_hunks: bool,
@ -164,41 +208,16 @@ def pr_generate_extended_diff(pr_languages: list,
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str,
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, int]:
"""
Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number of
tokens used.
Args:
top_langs (list): A list of dictionaries representing the languages used in the pull request and their
corresponding files.
token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the
pull request.
model (str): The model used for tokenization.
convert_hunks_to_line_numbers (bool): A boolean indicating whether to convert hunks to line numbers in the diff.
Returns:
Tuple[list, list, list]: A tuple containing the following lists:
- patches: A list of compressed diff patches for each file in the pull request.
- modified_files_list: A list of file names that were skipped due to large patch size.
- deleted_files_list: A list of file names that were deleted in the pull request.
Minimization techniques to reduce the number of tokens:
0. Start from the largest diff patch to smaller ones
1. Don't use extend context lines around diff
2. Minimize deleted files
3. Minimize deleted hunks
4. Minimize all remaining files when you reach token limit
"""
patches = []
added_files_list = []
modified_files_list = []
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, dict, list]:
deleted_files_list = []
# sort each one of the languages in top_langs by the number of tokens in the diff
sorted_files = []
for lang in top_langs:
sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
total_tokens = token_handler.prompt_tokens
# generate patches for each file, and count tokens
file_dict = {}
for file in sorted_files:
original_file_content_str = file.base_file
new_file_content_str = file.head_file
@ -210,55 +229,85 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
patch = handle_patch_deletions(patch, original_file_content_str,
new_file_content_str, file.filename, file.edit_type)
if patch is None:
# if not deleted_files_list:
# total_tokens += token_handler.count_tokens(DELETED_FILES_)
if file.filename not in deleted_files_list:
deleted_files_list.append(file.filename)
# total_tokens += token_handler.count_tokens(file.filename) + 1
continue
if convert_hunks_to_line_numbers:
patch = convert_to_hunks_with_lines_numbers(patch, file)
new_patch_tokens = token_handler.count_tokens(patch)
file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type}
max_tokens_model = get_max_tokens(model)
# first iteration
files_in_patches_list = []
remaining_files_list = [file.filename for file in sorted_files]
patches_list = []
total_tokens_list = []
total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict,
max_tokens_model, remaining_files_list, token_handler)
patches_list.append(patches)
total_tokens_list.append(total_tokens)
files_in_patches_list.append(files_in_patch_list)
# additional iterations (if needed)
NUMBER_OF_ALLOWED_ITERATIONS = get_settings().pr_description.max_ai_calls - 1 # one more call is to summarize
for i in range(NUMBER_OF_ALLOWED_ITERATIONS-1):
if remaining_files_list:
total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers,
file_dict,
max_tokens_model,
remaining_files_list, token_handler)
patches_list.append(patches)
total_tokens_list.append(total_tokens)
files_in_patches_list.append(files_in_patch_list)
else:
break
return patches_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model, remaining_files_list_prev, token_handler):
total_tokens = token_handler.prompt_tokens # initial tokens
patches = []
remaining_files_list_new = []
files_in_patch_list = []
for filename, data in file_dict.items():
if filename not in remaining_files_list_prev:
continue
patch = data['patch']
new_patch_tokens = data['tokens']
edit_type = data['edit_type']
# Hard Stop, no more tokens
if total_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
get_logger().warning(f"File was fully skipped, no more tokens: {file.filename}.")
if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
get_logger().warning(f"File was fully skipped, no more tokens: {filename}.")
continue
# If the patch is too large, just show the file name
if total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
if total_tokens + new_patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
# Current logic is to skip the patch if it's too large
# TODO: Option for alternative logic to remove hunks from the patch to reduce the number of tokens
# until we meet the requirements
if get_settings().config.verbosity_level >= 2:
get_logger().warning(f"Patch too large, minimizing it, {file.filename}")
if file.edit_type == EDIT_TYPE.ADDED:
# if not added_files_list:
# total_tokens += token_handler.count_tokens(ADDED_FILES_)
if file.filename not in added_files_list:
added_files_list.append(file.filename)
# total_tokens += token_handler.count_tokens(file.filename) + 1
else:
# if not modified_files_list:
# total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_)
if file.filename not in modified_files_list:
modified_files_list.append(file.filename)
# total_tokens += token_handler.count_tokens(file.filename) + 1
get_logger().warning(f"Patch too large, skipping it, {filename}")
remaining_files_list_new.append(filename)
continue
if patch:
if not convert_hunks_to_line_numbers:
patch_final = f"\n\n## file: '{file.filename.strip()}\n\n{patch.strip()}\n'"
patch_final = f"\n\n## file: '{filename.strip()}\n\n{patch.strip()}\n'"
else:
patch_final = "\n\n" + patch.strip()
patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final)
files_in_patch_list.append(filename)
if get_settings().config.verbosity_level >= 2:
get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}")
return patches, modified_files_list, deleted_files_list, added_files_list, total_tokens
get_logger().info(f"Tokens: {total_tokens}, last filename: {filename}")
return total_tokens, patches, remaining_files_list_new, files_in_patch_list
async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR):
@ -417,4 +466,4 @@ def get_pr_multi_diffs(git_provider: GitProvider,
final_diff = "\n".join(patches)
final_diff_list.append(final_diff)
return final_diff_list
return final_diff_list

View File

@ -74,7 +74,10 @@ inline_file_summary=false # false, true, 'table'
# markers
use_description_markers=false
include_generated_by_header=true
# large pr mode 💎
enable_large_pr_handling=true
max_ai_calls=3
mention_extra_files=true
#custom_labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other']
[pr_questions] # /ask #
@ -82,7 +85,7 @@ enable_help_text=false
[pr_code_suggestions] # /improve #
max_context_tokens=8000
max_context_tokens=10000
num_code_suggestions=4
commitable_code_suggestions = false
extra_instructions = ""
@ -105,16 +108,34 @@ final_clip_factor = 0.8
demand_code_suggestions_self_review=false # add a checkbox for the author to self-review the code suggestions
code_suggestions_self_review_text= "**Author self-review**: I have reviewed the PR code suggestions, and addressed the relevant ones."
approve_pr_on_self_review=false # Pro feature. if true, the PR will be auto-approved after the author clicks on the self-review checkbox
# Suggestion impact
publish_post_process_suggestion_impact=true
[pr_custom_prompt] # /custom_prompt #
prompt = """\
The code suggestions should focus only on the following:
- ...
- ...
...
"""
suggestions_score_threshold=0
num_code_suggestions_per_chunk=4
self_reflect_on_custom_suggestions=true
enable_help_text=false
[pr_add_docs] # /add_docs #
extra_instructions = ""
docs_style = "Sphinx Style" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
docs_style = "Sphinx" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
file = "" # in case there are several components with the same name, you can specify the relevant file
class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
[pr_update_changelog] # /update_changelog #
push_changelog_changes=false
extra_instructions = ""
[pr_analyze] # /analyze #
enable_help_text=true
[pr_test] # /test #
extra_instructions = ""
@ -129,13 +150,14 @@ enable_help_text=false
num_code_suggestions=4
extra_instructions = ""
file = "" # in case there are several components with the same name, you can specify the relevant file
class_name = ""
class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
[checks] # /checks (pro feature) #
enable_auto_checks_feedback=true
excluded_checks_list=["lint"] # list of checks to exclude, for example: ["check1", "check2"]
persistent_comment=true
enable_help_text=true
final_update_message = false
[pr_help] # /help #
@ -148,15 +170,16 @@ ratelimit_retries = 5
base_url = "https://api.github.com"
publish_inline_comments_fallback_with_verification = true
try_fix_invalid_inline_comments = true
app_name = "pr-agent"
[github_action_config]
# auto_review = true # set as env var in .github/workflows/pr-agent.yaml
# auto_describe = true # set as env var in .github/workflows/pr-agent.yaml
# auto_improve = true # set as env var in .github/workflows/pr-agent.yaml
# enable_output = true # set as env var in .github/workflows/pr-agent.yaml
[github_app]
# these toggles allow running the GitHub app from custom deployments
bot_user = "github-actions[bot]"
override_deployment_type = true
# settings for "pull_request" event
handle_pr_actions = ['opened', 'reopened', 'ready_for_review']
@ -180,7 +203,14 @@ ignore_pr_title = []
ignore_bot_pr = true
[gitlab]
url = "https://gitlab.com" # URL to the gitlab service
# URL to the gitlab service
url = "https://gitlab.com"
# Polling: either the project id or the namespace/project_name syntax can be used
projects_to_monitor = ['org_name/repo_name']
# Polling trigger
magic_word = "AutoReview"
# Polling interval
polling_interval_seconds = 30
pr_commands = [
"/describe",
"/review --pr_reviewer.num_code_suggestions=0",
@ -229,10 +259,18 @@ force_update_dataset = false
max_issues_to_scan = 500
vectordb = "pinecone"
[pr_find_similar_component]
class_name = ""
file = ""
search_from_org = false
allow_fallback_less_words = true
number_of_keywords = 5
number_of_results = 5
[pinecone]
# fill and place in .secrets.toml
#api_key = ...
# environment = "gcp-starter"
[lancedb]
uri = "./lancedb"
uri = "./lancedb"

View File

@ -37,7 +37,7 @@ class PRType(str, Enum):
{%- if enable_semantic_files_types %}
Class FileDescription(BaseModel):
class FileDescription(BaseModel):
filename: str = Field(description="the relevant file full path")
language: str = Field(description="the relevant file language")
changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
@ -45,7 +45,7 @@ Class FileDescription(BaseModel):
label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
{%- endif %}
Class PRDescription(BaseModel):
class PRDescription(BaseModel):
type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
{%- if enable_semantic_files_types %}
pr_files[List[FileDescription]] = Field(max_items=15, description="a list of the files in the PR, and their changes summary.")
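For illustration, model output matching the schema excerpted above could look roughly like this. It is a hypothetical, partial example: values are invented, fields of PRDescription not shown in this hunk are omitted, and changes_title is included because the tool's code reads it later in this diff:

```yaml
type:
- Bug fix
pr_files:
- filename: |
    pr_agent/algo/pr_processing.py
  language: |
    python
  changes_summary: |
    - Added large-PR handling that splits the diff into several compressed patches
  changes_title: |
    Large PR diff handling
  label: |
    enhancement
```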

View File

@ -35,7 +35,6 @@ class PRCodeSuggestions:
get_logger().info(f"Setting max_model_tokens to {MAX_CONTEXT_TOKENS_IMPROVE} for PR improve")
get_settings().config.max_model_tokens = MAX_CONTEXT_TOKENS_IMPROVE
# extended mode
try:
self.is_extended = self._get_is_extended(args or [])
@ -116,7 +115,7 @@ class PRCodeSuggestions:
# require self-review
if get_settings().pr_code_suggestions.demand_code_suggestions_self_review:
text= get_settings().pr_code_suggestions.code_suggestions_self_review_text
text = get_settings().pr_code_suggestions.code_suggestions_self_review_text
pr_body += f"\n\n- [ ] {text}"
if get_settings().pr_code_suggestions.approve_pr_on_self_review:
pr_body += ' <!-- approve pr self-review -->'
@ -193,8 +192,9 @@ class PRCodeSuggestions:
# self-reflect on suggestions
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
model = get_settings().config.model_turbo # use turbo model for self-reflection, since it is an easier task
response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"], patches_diff, model=model)
model = get_settings().config.model_turbo # use turbo model for self-reflection, since it is an easier task
response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"], patches_diff,
model=model)
if response_reflect:
response_reflect_yaml = load_yaml(response_reflect)
code_suggestions_feedback = response_reflect_yaml["code_suggestions"]
@ -203,7 +203,7 @@ class PRCodeSuggestions:
try:
suggestion["score"] = code_suggestions_feedback[i]["suggestion_score"]
suggestion["score_why"] = code_suggestions_feedback[i]["why"]
except Exception as e: #
except Exception as e: #
get_logger().error(f"Error processing suggestion score {i}",
artifact={"suggestion": suggestion,
"code_suggestions_feedback": code_suggestions_feedback[i]})
@ -226,7 +226,7 @@ class PRCodeSuggestions:
suggestion['improved_code'] = suggestion['improved_code'][:max_code_suggestion_length]
suggestion['improved_code'] += f"\n{suggestion_truncation_message}"
get_logger().info(f"Truncated suggestion from {len(suggestion['improved_code'])} "
f"characters to {max_code_suggestion_length} characters")
f"characters to {max_code_suggestion_length} characters")
return suggestion
def _prepare_pr_code_suggestions(self, predictions: str) -> Dict:
@ -240,17 +240,24 @@ class PRCodeSuggestions:
one_sentence_summary_list = []
for i, suggestion in enumerate(data['code_suggestions']):
try:
if (not suggestion or 'one_sentence_summary' not in suggestion or
'label' not in suggestion or 'relevant_file' not in suggestion):
get_logger().debug(f"Skipping suggestion {i + 1}, because it is invalid: {suggestion}")
needed_keys = ['one_sentence_summary', 'label', 'relevant_file', 'relevant_lines_start', 'relevant_lines_end']
is_valid_keys = True
for key in needed_keys:
if key not in suggestion:
is_valid_keys = False
get_logger().debug(f"Skipping suggestion {i + 1}, because it does not contain '{key}':\n'{suggestion}")
break
if not is_valid_keys:
continue
if suggestion['one_sentence_summary'] in one_sentence_summary_list:
get_logger().debug(f"Skipping suggestion {i + 1}, because it is a duplicate: {suggestion}")
continue
if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion['suggestion_content'] and 'let' in suggestion['suggestion_content']:
get_logger().debug(f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}")
if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion[
'suggestion_content'] and 'let' in suggestion['suggestion_content']:
get_logger().debug(
f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}")
continue
if ('existing_code' in suggestion) and ('improved_code' in suggestion):
@ -258,7 +265,7 @@ class PRCodeSuggestions:
get_logger().debug(
f"edited improved suggestion {i + 1}, because equal to existing code: {suggestion['existing_code']}")
if get_settings().pr_code_suggestions.commitable_code_suggestions:
suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR
suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR
else:
suggestion['existing_code'] = ""
suggestion = self._truncate_if_needed(suggestion)
@ -279,12 +286,15 @@ class PRCodeSuggestions:
if not data['code_suggestions']:
get_logger().info('No suggestions found to improve this PR.')
if self.progress_response:
return self.git_provider.edit_comment(self.progress_response, body='No suggestions found to improve this PR.')
return self.git_provider.edit_comment(self.progress_response,
body='No suggestions found to improve this PR.')
else:
return self.git_provider.publish_comment('No suggestions found to improve this PR.')
for d in data['code_suggestions']:
try:
if get_settings().config.verbosity_level >= 2:
get_logger().info(f"suggestion: {d}")
relevant_file = d['relevant_file'].strip()
relevant_lines_start = int(d['relevant_lines_start']) # absolute position
relevant_lines_end = int(d['relevant_lines_end'])
@ -300,8 +310,8 @@ class PRCodeSuggestions:
else:
body = f"**Suggestion:** {content} [{label}]\n```suggestion\n" + new_code_snippet + "\n```"
code_suggestions.append({'body': body, 'relevant_file': relevant_file,
'relevant_lines_start': relevant_lines_start,
'relevant_lines_end': relevant_lines_end})
'relevant_lines_start': relevant_lines_start,
'relevant_lines_end': relevant_lines_end})
except Exception:
get_logger().info(f"Could not parse suggestion: {d}")
@ -477,14 +487,15 @@ class PRCodeSuggestions:
# sort suggestions_labels by the suggestion with the highest score
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
suggestions_labels = dict(sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
suggestions_labels = dict(
sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
# sort the suggestions inside each label group by score
for label, suggestions in suggestions_labels.items():
suggestions_labels[label] = sorted(suggestions, key=lambda x: x['score'], reverse=True)
counter_suggestions = 0
for label, suggestions in suggestions_labels.items():
num_suggestions=len(suggestions)
num_suggestions = len(suggestions)
pr_body += f"""<tr><td rowspan={num_suggestions}><strong>{label.capitalize()}</strong></td>\n"""
for i, suggestion in enumerate(suggestions):
@ -508,8 +519,8 @@ class PRCodeSuggestions:
suggestion_content = insert_br_after_x_chars(suggestion_content, 90)
# pr_body += f"<tr><td><details><summary>{suggestion_content}</summary>"
existing_code = suggestion['existing_code'].rstrip()+"\n"
improved_code = suggestion['improved_code'].rstrip()+"\n"
existing_code = suggestion['existing_code'].rstrip() + "\n"
improved_code = suggestion['improved_code'].rstrip() + "\n"
diff = difflib.unified_diff(existing_code.split('\n'),
improved_code.split('\n'), n=999)
@ -518,7 +529,7 @@ class PRCodeSuggestions:
example_code = ""
example_code += f"```diff\n{patch}\n```\n"
if i==0:
if i == 0:
pr_body += f"""<td>\n\n"""
else:
pr_body += f"""<tr><td>\n\n"""
@ -529,13 +540,20 @@ class PRCodeSuggestions:
pr_body += f"""\n\n<details><summary>{suggestion_summary}</summary>\n\n___\n\n"""
pr_body += f"""
**{suggestion_content}**
[{relevant_file} {range_str}]({code_snippet_link})
{example_code}
{example_code.rstrip()}
"""
if (get_settings().pr_code_suggestions.apply_suggestions_checkbox and
(isinstance(self.git_provider, GithubProvider) or isinstance(self.git_provider,
GitLabProvider))):
# add a checkbox line, to create a committable suggestion from the table suggestion
if '...' not in patch:
pr_body += f"""\n- [ ] **Apply this suggestion** <!-- /improve --apply_suggestion={counter_suggestions} -->\n\n"""
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
pr_body +=f"\n\n<details><summary><b>Suggestion importance[1-10]: {suggestion['score']}</b></summary>\n\n"
pr_body += f"<details><summary>Suggestion importance[1-10]: {suggestion['score']}</summary>\n\n"
pr_body += f"Why: {suggestion['score_why']}\n\n"
pr_body += f"</details>"
@ -546,7 +564,7 @@ class PRCodeSuggestions:
pr_body += f"</td><td align=center>{suggestion['score']}\n\n"
pr_body += f"</td></tr>"
counter_suggestions += 1
# pr_body += "</details>"
# pr_body += """</td></tr>"""
@ -570,14 +588,54 @@ class PRCodeSuggestions:
"diff": patches_diff,
'num_code_suggestions': len(suggestion_list)}
environment = Environment(undefined=StrictUndefined)
system_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.system).render(
system_prompt_reflect = environment.from_string(
get_settings().pr_code_suggestions_reflect_prompt.system).render(
variables)
user_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.user).render(variables)
user_prompt_reflect = environment.from_string(
get_settings().pr_code_suggestions_reflect_prompt.user).render(variables)
with get_logger().contextualize(command="self_reflect_on_suggestions"):
response_reflect, finish_reason_reflect = await self.ai_handler.chat_completion(model=model,
system=system_prompt_reflect,
user=user_prompt_reflect)
system=system_prompt_reflect,
user=user_prompt_reflect)
except Exception as e:
get_logger().info(f"Could not reflect on suggestions, error: {e}")
return ""
return response_reflect
return response_reflect
async def handle_apply_suggestion(self):
try:
get_logger().info('Processing "apply" suggestion...')
suggestion_number = get_settings().apply_suggestion
comment_after = get_settings().pr_code_suggestions.get('comment_after', None)
if suggestion_number is None or comment_after is None:
get_logger().error('Invalid suggestion number or comment_after')
return False
suggestions = parse_suggestions_content(comment_after)
if not suggestions:
get_logger().error('Failed to parse suggestions')
return False
suggestion = suggestions[suggestion_number]
if hasattr(self, 'main_language'):
self.git_provider.main_language = self.main_language
relevant_file = suggestion['suggestion_orig_location']['filename']
relevant_lines_start = int(suggestion['suggestion_orig_location']['start_line'])
relevant_lines_end = int(suggestion['suggestion_orig_location']['end_line'])
content = suggestion['suggestion_summary']
new_code_snippet = suggestion['new_code_snippet']
label = suggestion['category']
score = suggestion['score']
if new_code_snippet:
new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet)
body = f"**Suggestion:** {content} [{label}, importance: {score}]\n```suggestion\n" + new_code_snippet + "\n```"
original_suggestion = suggestion
code_suggestions = [({'original_suggestion': original_suggestion,
'body': body, 'relevant_file': relevant_file,
'relevant_lines_start': relevant_lines_start,
'relevant_lines_end': relevant_lines_end})]
is_successful = self.git_provider.publish_code_suggestions(code_suggestions)
get_settings().set("suggestion_score", score)
get_settings().set("suggestion_label", label)
except Exception as e:
get_logger().info(f"Failed to apply suggestion, error: {e}")
is_successful = False
return is_successful

View File

@ -7,11 +7,14 @@ from jinja2 import Environment, StrictUndefined
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_diff_multiple_patchs, \
OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels, ModelType, show_relevant_configurations
from pr_agent.algo.utils import set_custom_labels
from pr_agent.algo.utils import load_yaml, get_user_labels, ModelType, show_relevant_configurations, get_max_tokens, \
clip_tokens
from pr_agent.config_loader import get_settings
from pr_agent.git_providers import get_git_provider, get_git_provider_with_context
from pr_agent.git_providers import get_git_provider, GithubProvider, get_git_provider_with_context
from pr_agent.git_providers.git_provider import get_main_pr_language
from pr_agent.log import get_logger
from pr_agent.servers.help import HelpMessage
@ -56,6 +59,7 @@ class PRDescription:
"custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function
"enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types,
}
self.user_description = self.git_provider.get_user_description()
# Initialize the token handler
@ -163,32 +167,105 @@ class PRDescription:
if get_settings().pr_description.use_description_markers and 'pr_agent:' not in self.user_description:
return None
self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
if self.patches_diff:
get_logger().debug(f"PR diff", artifact=self.patches_diff)
self.prediction = await self._get_prediction(model)
large_pr_handling = get_settings().pr_description.enable_large_pr_handling and "pr_description_only_files_prompts" in get_settings()
patches_diff = get_pr_diff(self.git_provider, self.token_handler, model, large_pr_handling=large_pr_handling)
if not large_pr_handling or patches_diff:
self.patches_diff = patches_diff
if patches_diff:
get_logger().debug(f"PR diff", artifact=self.patches_diff)
self.prediction = await self._get_prediction(model, patches_diff, prompt="pr_description_prompt")
else:
get_logger().error(f"Error getting PR diff {self.pr_id}")
self.prediction = None
else:
get_logger().error(f"Error getting PR diff {self.pr_id}")
self.prediction = None
# get the diff in multiple patches, with the token handler only for the files prompt
get_logger().debug('large_pr_handling for describe')
token_handler_only_files_prompt = TokenHandler(
self.git_provider.pr,
self.vars,
get_settings().pr_description_only_files_prompts.system,
get_settings().pr_description_only_files_prompts.user,
)
(patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict,
files_in_patches_list) = get_pr_diff_multiple_patchs(
self.git_provider, token_handler_only_files_prompt, model)
async def _get_prediction(self, model: str) -> str:
"""
Generate an AI prediction for the PR description based on the provided model.
# get the files prediction for each patch
file_description_str_list = []
for i, patches in enumerate(patches_compressed_list):
patches_diff = "\n".join(patches)
get_logger().debug(f"PR diff number {i + 1} for describe files")
prediction_files = await self._get_prediction(model, patches_diff,
prompt="pr_description_only_files_prompts")
prediction_files = prediction_files.strip().removeprefix('```yaml').strip('`').strip()
if load_yaml(prediction_files) and prediction_files.startswith('pr_files'):
prediction_files = prediction_files.removeprefix('pr_files:').strip()
file_description_str_list.append(prediction_files)
else:
get_logger().debug(f"failed to generate predictions in iteration {i + 1} for describe files")
Args:
model (str): The name of the model to be used for generating the prediction.
# generate files_walkthrough string, with proper token handling
token_handler_only_description_prompt = TokenHandler(
self.git_provider.pr,
self.vars,
get_settings().pr_description_only_description_prompts.system,
get_settings().pr_description_only_description_prompts.user)
files_walkthrough = "\n".join(file_description_str_list)
if remaining_files_list:
files_walkthrough += "\n\nNo more token budget. Additional unprocessed files:"
for file in remaining_files_list:
files_walkthrough += f"\n- {file}"
if deleted_files_list:
files_walkthrough += "\n\nAdditional deleted files:"
for file in deleted_files_list:
files_walkthrough += f"\n- {file}"
tokens_files_walkthrough = len(token_handler_only_description_prompt.encoder.encode(files_walkthrough))
total_tokens = token_handler_only_description_prompt.prompt_tokens + tokens_files_walkthrough
max_tokens_model = get_max_tokens(model)
if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
# clip files_walkthrough to fit the tokens within the limit
files_walkthrough = clip_tokens(files_walkthrough,
max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD - token_handler_only_description_prompt.prompt_tokens,
num_input_tokens=tokens_files_walkthrough)
Returns:
str: The generated AI prediction.
"""
# PR header inference
# TODO - add deleted and unprocessed files to the prompt ('files_walkthrough'), as extra data
get_logger().debug(f"PR diff only description", artifact=files_walkthrough)
prediction_headers = await self._get_prediction(model, patches_diff=files_walkthrough,
prompt="pr_description_only_description_prompts")
prediction_headers = prediction_headers.strip().removeprefix('```yaml').strip('`').strip()
if get_settings().pr_description.mention_extra_files:
for file in remaining_files_list:
extra_file_yaml = f"""\
- filename: |
{file}
changes_summary: |
...
changes_title: |
...
label: |
not processed (token-limit)
"""
files_walkthrough = files_walkthrough.strip() + "\n" + extra_file_yaml.strip()
# final processing
self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough
if not load_yaml(self.prediction):
get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}")
if load_yaml(prediction_headers):
get_logger().debug(f"Using only headers for describe {self.pr_id}")
self.prediction = prediction_headers
async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_description_prompt") -> str:
variables = copy.deepcopy(self.vars)
variables["diff"] = self.patches_diff # update diff
variables["diff"] = patches_diff # update diff
environment = Environment(undefined=StrictUndefined)
set_custom_labels(variables, self.git_provider)
self.variables = variables
system_prompt = environment.from_string(get_settings().pr_description_prompt.system).render(variables)
user_prompt = environment.from_string(get_settings().pr_description_prompt.user).render(variables)
system_prompt = environment.from_string(get_settings().get(prompt, {}).get("system", "")).render(variables)
user_prompt = environment.from_string(get_settings().get(prompt, {}).get("user", "")).render(variables)
response, finish_reason = await self.ai_handler.chat_completion(
model=model,
@ -351,7 +428,7 @@ class PRDescription:
filename = file['filename'].replace("'", "`").replace('"', '`')
changes_summary = file['changes_summary']
changes_title = file['changes_title'].strip()
label = file.get('label')
label = file.get('label').strip().lower()
if label not in file_label_dict:
file_label_dict[label] = []
file_label_dict[label].append((filename, changes_title, changes_summary))
@ -392,6 +469,7 @@ class PRDescription:
for filename, file_changes_title, file_change_description in list_tuples:
filename = filename.replace("'", "`").rstrip()
filename_publish = filename.split("/")[-1]
file_changes_title_code = f"<code>{file_changes_title}</code>"
file_changes_title_code_br = insert_br_after_x_chars(file_changes_title_code, x=(delta - 5)).strip()
if len(file_changes_title_code_br) < (delta - 5):
@ -423,14 +501,16 @@ class PRDescription:
<hr>
{filename}
{file_change_description_br}
</details>
</td>
<td><a href="{link}">{diff_plus_minus}</a>{delta_nbsp}</td>
</tr>
"""
if use_collapsible_file_list: