diff --git a/docs/docs/tools/describe.md b/docs/docs/tools/describe.md
index f5357dee..9fe50e66 100644
--- a/docs/docs/tools/describe.md
+++ b/docs/docs/tools/describe.md
@@ -87,6 +87,10 @@ publish_labels = ...
   collapsible_file_list
   If set to true, the file list in the "Changes walkthrough" section will be collapsible. If set to "adaptive", the file list will be collapsible only if there are more than 8 files. Default is "adaptive".
+
+  enable_large_pr_handling
+  Pro feature. If set to true, then for a large PR the tool will make several AI calls and combine their results, so that more files can be covered. Default is true.
+
   enable_help_text
   If set to true, the tool will display a help text in the comment. Default is false.
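For orientation, a minimal sketch of how a caller can consume this flag (the helper itself is hypothetical; `get_pr_diff`, `get_pr_diff_multiple_patchs`, and `get_settings` are the APIs touched by this patch, and an empty string from `get_pr_diff` is the signal to fall back to multi-patch handling):

```python
from pr_agent.algo.pr_processing import get_pr_diff, get_pr_diff_multiple_patchs
from pr_agent.config_loader import get_settings

def build_describe_diff(git_provider, token_handler, model: str) -> str:
    # Pro feature: when large-PR handling is enabled and the compressed diff
    # spans more than one patch, get_pr_diff() returns "" on purpose.
    large_pr_handling = get_settings().pr_description.enable_large_pr_handling
    patches_diff = get_pr_diff(git_provider, token_handler, model,
                               large_pr_handling=large_pr_handling)
    if large_pr_handling and not patches_diff:
        # Hypothetical fallback; the real flow (PRDescription below) sends each
        # patch to a separate AI call and merges the answers.
        patches_compressed_list, *_ = get_pr_diff_multiple_patchs(
            git_provider, token_handler, model)
        return "\n".join(patches_compressed_list[0]) if patches_compressed_list else ""
    return patches_diff
```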
- """ - + add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False, large_pr_handling=False) -> str: if disable_extra_lines: PATCH_EXTRA_LINES = 0 else: @@ -87,39 +71,99 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s # if we are over the limit, start pruning get_logger().info(f"Tokens: {total_tokens}, total tokens over limit: {get_max_tokens(model)}, " f"pruning diff.") - patches_compressed, modified_file_names, deleted_file_names, added_file_names, total_tokens_new = \ + patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \ pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks) + if large_pr_handling and len(patches_compressed_list) > 1: + get_logger().info(f"Large PR handling mode, and found {len(patches_compressed_list)} patches with original diff.") + return "" # return empty string, as we generate multiple patches with a different prompt + + # return the first patch + patches_compressed = patches_compressed_list[0] + total_tokens_new = total_tokens_list[0] + files_in_patch = files_in_patches_list[0] + # Insert additional information about added, modified, and deleted files if there is enough space max_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD curr_token = total_tokens_new # == token_handler.count_tokens(final_diff)+token_handler.prompt_tokens final_diff = "\n".join(patches_compressed) delta_tokens = 10 - if added_file_names and (max_tokens - curr_token) > delta_tokens: - added_list_str = ADDED_FILES_ + "\n".join(added_file_names) - added_list_str = clip_tokens(added_list_str, max_tokens - curr_token) - if added_list_str: - final_diff = final_diff + "\n\n" + added_list_str - curr_token += token_handler.count_tokens(added_list_str) + 2 - if modified_file_names and (max_tokens - curr_token) > delta_tokens: - modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) - modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token) - if modified_list_str: - final_diff = final_diff + "\n\n" + modified_list_str - curr_token += token_handler.count_tokens(modified_list_str) + 2 - if deleted_file_names and (max_tokens - curr_token) > delta_tokens: - deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names) - deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token) - if deleted_list_str: - final_diff = final_diff + "\n\n" + deleted_list_str - try: - get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, " - f"deleted_list_str: {deleted_list_str}") - except Exception as e: - pass + added_list_str = modified_list_str = deleted_list_str = "" + unprocessed_files = [] + # generate the added, modified, and deleted files lists + if (max_tokens - curr_token) > delta_tokens: + for filename, file_values in file_dict.items(): + if filename in files_in_patch: + continue + if file_values['edit_type'] == EDIT_TYPE.ADDED: + unprocessed_files.append(filename) + if not added_list_str: + added_list_str = ADDED_FILES_ + f"\n{filename}" + else: + added_list_str = added_list_str + f"\n{filename}" + elif file_values['edit_type'] == EDIT_TYPE.MODIFIED or EDIT_TYPE.RENAMED: + unprocessed_files.append(filename) + if not modified_list_str: + modified_list_str = MORE_MODIFIED_FILES_ + f"\n{filename}" + else: + modified_list_str = modified_list_str + f"\n{filename}" + elif file_values['edit_type'] == EDIT_TYPE.DELETED: + # 
+                # unprocessed_files.append(filename)  # not needed here, because the file was deleted, so no need to process it
+                if not deleted_list_str:
+                    deleted_list_str = DELETED_FILES_ + f"\n{filename}"
+                else:
+                    deleted_list_str = deleted_list_str + f"\n{filename}"
+
+    # prune the added, modified, and deleted files lists, and add them to the final diff
+    added_list_str = clip_tokens(added_list_str, max_tokens - curr_token)
+    if added_list_str:
+        final_diff = final_diff + "\n\n" + added_list_str
+        curr_token += token_handler.count_tokens(added_list_str) + 2
+    modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token)
+    if modified_list_str:
+        final_diff = final_diff + "\n\n" + modified_list_str
+        curr_token += token_handler.count_tokens(modified_list_str) + 2
+    deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token)
+    if deleted_list_str:
+        final_diff = final_diff + "\n\n" + deleted_list_str
+
+    get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, "
+                       f"deleted_list_str: {deleted_list_str}")
     return final_diff
 
+def get_pr_diff_multiple_patchs(git_provider: GitProvider, token_handler: TokenHandler, model: str,
+                                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False):
+    try:
+        diff_files_original = git_provider.get_diff_files()
+    except RateLimitExceededException as e:
+        get_logger().error(f"Rate limit exceeded for git provider API. original message {e}")
+        raise
+
+    diff_files = filter_ignored(diff_files_original)
+    if diff_files != diff_files_original:
+        try:
+            get_logger().info(f"Filtered out {len(diff_files_original) - len(diff_files)} files")
+            new_names = set([a.filename for a in diff_files])
+            orig_names = set([a.filename for a in diff_files_original])
+            get_logger().info(f"Filtered out files: {orig_names - new_names}")
+        except Exception as e:
+            pass
+
+    # get pr languages
+    pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
+    if pr_languages:
+        try:
+            get_logger().info(f"PR main language: {pr_languages[0]['language']}")
+        except Exception as e:
+            pass
+
+    patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \
+        pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
+
+    return patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
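A sketch of how the six return values are meant to be unpacked — it mirrors the `PRDescription` flow later in this diff; the driver function itself is hypothetical:

```python
async def describe_in_chunks(pr_description, git_provider, token_handler, model: str) -> list:
    (patches_compressed_list, total_tokens_list, deleted_files_list,
     remaining_files_list, file_dict, files_in_patches_list) = \
        get_pr_diff_multiple_patchs(git_provider, token_handler, model)

    predictions = []
    for i, patches in enumerate(patches_compressed_list):
        patches_diff = "\n".join(patches)  # one token-budgeted chunk of the PR
        # one AI call per chunk, as in PRDescription._get_prediction below
        predictions.append(await pr_description._get_prediction(
            model, patches_diff, prompt="pr_description_only_files_prompts"))
    # remaining_files_list / deleted_files_list end up mentioned by name only
    return predictions
```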
+
+
 def pr_generate_extended_diff(pr_languages: list,
                               token_handler: TokenHandler,
                               add_line_numbers_to_hunks: bool,
@@ -164,41 +208,16 @@ def pr_generate_extended_diff(pr_languages: list,
 
 def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str,
-                                convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, int]:
-    """
-    Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number of
-    tokens used.
-    Args:
-        top_langs (list): A list of dictionaries representing the languages used in the pull request and their
-        corresponding files.
-        token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the
-        pull request.
-        model (str): The model used for tokenization.
-        convert_hunks_to_line_numbers (bool): A boolean indicating whether to convert hunks to line numbers in the diff.
-    Returns:
-        Tuple[list, list, list]: A tuple containing the following lists:
-            - patches: A list of compressed diff patches for each file in the pull request.
-            - modified_files_list: A list of file names that were skipped due to large patch size.
-            - deleted_files_list: A list of file names that were deleted in the pull request.
-
-    Minimization techniques to reduce the number of tokens:
-    0. Start from the largest diff patch to smaller ones
-    1. Don't use extend context lines around diff
-    2. Minimize deleted files
-    3. Minimize deleted hunks
-    4. Minimize all remaining files when you reach token limit
-    """
-
-    patches = []
-    added_files_list = []
-    modified_files_list = []
+                                convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, dict, list]:
     deleted_files_list = []
+
     # sort each one of the languages in top_langs by the number of tokens in the diff
     sorted_files = []
     for lang in top_langs:
         sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
-    total_tokens = token_handler.prompt_tokens
+    # generate patches for each file, and count tokens
+    file_dict = {}
     for file in sorted_files:
         original_file_content_str = file.base_file
         new_file_content_str = file.head_file
@@ -210,55 +229,85 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
         patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str,
                                        file.filename, file.edit_type)
         if patch is None:
-            # if not deleted_files_list:
-            #     total_tokens += token_handler.count_tokens(DELETED_FILES_)
             if file.filename not in deleted_files_list:
                 deleted_files_list.append(file.filename)
-            # total_tokens += token_handler.count_tokens(file.filename) + 1
             continue
 
         if convert_hunks_to_line_numbers:
             patch = convert_to_hunks_with_lines_numbers(patch, file)
 
         new_patch_tokens = token_handler.count_tokens(patch)
+        file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type}
+
+    max_tokens_model = get_max_tokens(model)
+
+    # first iteration
+    files_in_patches_list = []
+    remaining_files_list = [file.filename for file in sorted_files]
+    patches_list = []
+    total_tokens_list = []
+    total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict,
+                                                                                           max_tokens_model, remaining_files_list, token_handler)
+    patches_list.append(patches)
+    total_tokens_list.append(total_tokens)
+    files_in_patches_list.append(files_in_patch_list)
+
+    # additional iterations (if needed)
+    NUMBER_OF_ALLOWED_ITERATIONS = get_settings().pr_description.max_ai_calls - 1  # one more call is to summarize
+    for i in range(NUMBER_OF_ALLOWED_ITERATIONS - 1):
+        if remaining_files_list:
+            total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers,
+                                                                                                   file_dict,
+                                                                                                   max_tokens_model,
+                                                                                                   remaining_files_list, token_handler)
+            patches_list.append(patches)
+            total_tokens_list.append(total_tokens)
+            files_in_patches_list.append(files_in_patch_list)
+        else:
+            break
+
+    return patches_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
+
+
+def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model, remaining_files_list_prev, token_handler):
+    total_tokens = token_handler.prompt_tokens  # initial tokens
+    patches = []
+    remaining_files_list_new = []
+    files_in_patch_list = []
+    for filename, data in file_dict.items():
+        if filename not in remaining_files_list_prev:
+            continue
+
+        patch = data['patch']
+        new_patch_tokens = data['tokens']
+        edit_type = data['edit_type']
 
         # Hard Stop, no more tokens
-        if total_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
-            get_logger().warning(f"File was fully skipped, no more tokens: {file.filename}.")
+        if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
+            get_logger().warning(f"File was fully skipped, no more tokens: {filename}.")
             continue
 
         # If the patch is too large, just show the file name
-        if total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
+        if total_tokens + new_patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
            # Current logic is to skip the patch if it's too large
            # TODO: Option for alternative logic to remove hunks from the patch to reduce the number of tokens
            #  until we meet the requirements
            if get_settings().config.verbosity_level >= 2:
-               get_logger().warning(f"Patch too large, minimizing it, {file.filename}")
-           if file.edit_type == EDIT_TYPE.ADDED:
-               # if not added_files_list:
-               #     total_tokens += token_handler.count_tokens(ADDED_FILES_)
-               if file.filename not in added_files_list:
-                   added_files_list.append(file.filename)
-               # total_tokens += token_handler.count_tokens(file.filename) + 1
-           else:
-               # if not modified_files_list:
-               #     total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_)
-               if file.filename not in modified_files_list:
-                   modified_files_list.append(file.filename)
-               # total_tokens += token_handler.count_tokens(file.filename) + 1
+               get_logger().warning(f"Patch too large, skipping it, {filename}")
+           remaining_files_list_new.append(filename)
            continue
 
         if patch:
             if not convert_hunks_to_line_numbers:
-                patch_final = f"\n\n## file: '{file.filename.strip()}\n\n{patch.strip()}\n'"
+                patch_final = f"\n\n## file: '{filename.strip()}\n\n{patch.strip()}\n'"
             else:
+                patch_final = "\n\n" + patch.strip()
             patches.append(patch_final)
             total_tokens += token_handler.count_tokens(patch_final)
+            files_in_patch_list.append(filename)
             if get_settings().config.verbosity_level >= 2:
-                get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}")
-
-    return patches, modified_files_list, deleted_files_list, added_files_list, total_tokens
+                get_logger().info(f"Tokens: {total_tokens}, last filename: {filename}")
+    return total_tokens, patches, remaining_files_list_new, files_in_patch_list
 
 
 async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR):
@@ -417,4 +466,4 @@ def get_pr_multi_diffs(git_provider: GitProvider,
         final_diff = "\n".join(patches)
         final_diff_list.append(final_diff)
 
-    return final_diff_list
\ No newline at end of file
+    return final_diff_list
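The hard/soft guard inside `generate_full_patch` above is the core of the budgeting logic. Restated as a standalone helper (a sketch; the two constants are the module-level buffers from the top of this file):

```python
OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000  # module constants from pr_processing.py
OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600

def budget_check(total_tokens: int, patch_tokens: int, max_tokens_model: int) -> str:
    # Hard stop: the running total already exhausted the budget; skip the file.
    if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
        return "hard_stop"
    # Soft stop: this patch alone would overflow the output buffer; the file is
    # deferred to remaining_files_list and retried in the next iteration.
    if total_tokens + patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
        return "defer"
    return "include"  # append the patch and add its tokens to the running total
```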
diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index 2154ec5e..76521939 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -74,7 +74,10 @@ inline_file_summary=false # false, true, 'table'
 # markers
 use_description_markers=false
 include_generated_by_header=true
-
+# large PR mode 💎
+enable_large_pr_handling=true
+max_ai_calls=3
+mention_extra_files=true
 #custom_labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other']
 
 [pr_questions] # /ask #
@@ -82,7 +85,7 @@ enable_help_text=false
 
 [pr_code_suggestions] # /improve #
-max_context_tokens=8000
+max_context_tokens=10000
 num_code_suggestions=4
 commitable_code_suggestions = false
 extra_instructions = ""
@@ -105,16 +108,34 @@ final_clip_factor = 0.8
 demand_code_suggestions_self_review=false # add a checkbox for the author to self-review the code suggestions
 code_suggestions_self_review_text = "**Author self-review**: I have reviewed the PR code suggestions, and addressed the relevant ones."
 approve_pr_on_self_review=false # Pro feature. If true, the PR will be auto-approved after the author clicks on the self-review checkbox
+# Suggestion impact
+publish_post_process_suggestion_impact=true
+
+[pr_custom_prompt] # /custom_prompt #
+prompt = """\
+The code suggestions should focus only on the following:
+- ...
+- ...
+...
+"""
+suggestions_score_threshold=0
+num_code_suggestions_per_chunk=4
+self_reflect_on_custom_suggestions=true
+enable_help_text=false
+
 [pr_add_docs] # /add_docs #
 extra_instructions = ""
-docs_style = "Sphinx Style" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
+docs_style = "Sphinx" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
+file = "" # in case there are several components with the same name, you can specify the relevant file
+class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
 
 [pr_update_changelog] # /update_changelog #
 push_changelog_changes=false
 extra_instructions = ""
 
 [pr_analyze] # /analyze #
+enable_help_text=true
 
 [pr_test] # /test #
 extra_instructions = ""
@@ -129,13 +150,14 @@ enable_help_text=false
 num_code_suggestions=4
 extra_instructions = ""
 file = "" # in case there are several components with the same name, you can specify the relevant file
-class_name = ""
+class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
 
 [checks] # /checks (pro feature) #
 enable_auto_checks_feedback=true
 excluded_checks_list=["lint"] # list of checks to exclude, for example: ["check1", "check2"]
 persistent_comment=true
 enable_help_text=true
+final_update_message = false
 
 [pr_help] # /help #
 
@@ -148,15 +170,16 @@ ratelimit_retries = 5
 base_url = "https://api.github.com"
 publish_inline_comments_fallback_with_verification = true
 try_fix_invalid_inline_comments = true
+app_name = "pr-agent"
 
 [github_action_config]
 # auto_review = true # set as env var in .github/workflows/pr-agent.yaml
 # auto_describe = true # set as env var in .github/workflows/pr-agent.yaml
 # auto_improve = true # set as env var in .github/workflows/pr-agent.yaml
-# enable_output = true # set as env var in .github/workflows/pr-agent.yaml
 
 [github_app]
-# these toggles allow running the github app from custom deployments
+# these toggles allow running the github app from custom deployments
+bot_user = "github-actions[bot]"
 override_deployment_type = true
 # settings for "pull_request" event
 handle_pr_actions = ['opened', 'reopened', 'ready_for_review']
@@ -180,7 +203,14 @@ ignore_pr_title = []
 ignore_bot_pr = true
 
 [gitlab]
-url = "https://gitlab.com" # URL to the gitlab service
+# URL to the gitlab service
+url = "https://gitlab.com"
+# Polling: either project id or namespace/project_name syntax can be used
+projects_to_monitor = ['org_name/repo_name']
+# Polling trigger
+magic_word = "AutoReview"
+# Polling interval
+polling_interval_seconds = 30
 pr_commands = [
     "/describe",
     "/review --pr_reviewer.num_code_suggestions=0",
@@ -229,10 +259,18 @@ force_update_dataset = false
 max_issues_to_scan = 500
 vectordb = "pinecone"
 
+[pr_find_similar_component]
+class_name = ""
+file = ""
+search_from_org = false
+allow_fallback_less_words = true
+number_of_keywords = 5
+number_of_results = 5
+
 [pinecone]
 # fill and place in .secrets.toml
 #api_key = ...
 # environment = "gcp-starter"
 
 [lancedb]
-uri = "./lancedb"
\ No newline at end of file
+uri = "./lancedb"
diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml
index 80df6d60..9c58eb3d 100644
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@@ -37,7 +37,7 @@ class PRType(str, Enum):
 
 {%- if enable_semantic_files_types %}
-Class FileDescription(BaseModel):
+class FileDescription(BaseModel):
     filename: str = Field(description="the relevant file full path")
     language: str = Field(description="the relevant file language")
     changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
@@ -45,7 +45,7 @@ Class FileDescription(BaseModel):
     label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
 {%- endif %}
 
-Class PRDescription(BaseModel):
+class PRDescription(BaseModel):
     type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
 {%- if enable_semantic_files_types %}
     pr_files[List[FileDescription]] = Field(max_items=15, description="a list of the files in the PR, and their changes summary.")
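The prompt above declares its expected output as Pydantic-style classes. For reference, the schema as actual Python would look roughly like this (a sketch: the prompt's `pr_files[List[FileDescription]]` line is conventionally written as an annotated field, the `PRType` members are defined in context lines not shown in this hunk, and `changes_title` is inferred from its usage later in this diff):

```python
from typing import List
from pydantic import BaseModel, Field

class FileDescription(BaseModel):
    filename: str = Field(description="the relevant file full path")
    language: str = Field(description="the relevant file language")
    changes_summary: str = Field(description="concise summary of the changes, in 1-4 bullet points")
    changes_title: str = Field(description="one-line title for the changes")  # assumed field
    label: str = Field(description="a single semantic label, e.g. 'bug fix', 'tests', 'enhancement'")

class PRDescription(BaseModel):
    type: List[str] = Field(description="one or more types that describe the PR content")
    pr_files: List[FileDescription] = Field(max_items=15,
                                            description="the files in the PR and their changes summary")
```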
artifact={"suggestion": suggestion, "code_suggestions_feedback": code_suggestions_feedback[i]}) @@ -226,7 +226,7 @@ class PRCodeSuggestions: suggestion['improved_code'] = suggestion['improved_code'][:max_code_suggestion_length] suggestion['improved_code'] += f"\n{suggestion_truncation_message}" get_logger().info(f"Truncated suggestion from {len(suggestion['improved_code'])} " - f"characters to {max_code_suggestion_length} characters") + f"characters to {max_code_suggestion_length} characters") return suggestion def _prepare_pr_code_suggestions(self, predictions: str) -> Dict: @@ -240,17 +240,24 @@ class PRCodeSuggestions: one_sentence_summary_list = [] for i, suggestion in enumerate(data['code_suggestions']): try: - if (not suggestion or 'one_sentence_summary' not in suggestion or - 'label' not in suggestion or 'relevant_file' not in suggestion): - get_logger().debug(f"Skipping suggestion {i + 1}, because it is invalid: {suggestion}") + needed_keys = ['one_sentence_summary', 'label', 'relevant_file', 'relevant_lines_start', 'relevant_lines_end'] + is_valid_keys = True + for key in needed_keys: + if key not in suggestion: + is_valid_keys = False + get_logger().debug(f"Skipping suggestion {i + 1}, because it does not contain '{key}':\n'{suggestion}") + break + if not is_valid_keys: continue if suggestion['one_sentence_summary'] in one_sentence_summary_list: get_logger().debug(f"Skipping suggestion {i + 1}, because it is a duplicate: {suggestion}") continue - if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion['suggestion_content'] and 'let' in suggestion['suggestion_content']: - get_logger().debug(f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}") + if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion[ + 'suggestion_content'] and 'let' in suggestion['suggestion_content']: + get_logger().debug( + f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}") continue if ('existing_code' in suggestion) and ('improved_code' in suggestion): @@ -258,7 +265,7 @@ class PRCodeSuggestions: get_logger().debug( f"edited improved suggestion {i + 1}, because equal to existing code: {suggestion['existing_code']}") if get_settings().pr_code_suggestions.commitable_code_suggestions: - suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR + suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR else: suggestion['existing_code'] = "" suggestion = self._truncate_if_needed(suggestion) @@ -279,12 +286,15 @@ class PRCodeSuggestions: if not data['code_suggestions']: get_logger().info('No suggestions found to improve this PR.') if self.progress_response: - return self.git_provider.edit_comment(self.progress_response, body='No suggestions found to improve this PR.') + return self.git_provider.edit_comment(self.progress_response, + body='No suggestions found to improve this PR.') else: return self.git_provider.publish_comment('No suggestions found to improve this PR.') for d in data['code_suggestions']: try: + if get_settings().config.verbosity_level >= 2: + get_logger().info(f"suggestion: {d}") relevant_file = d['relevant_file'].strip() relevant_lines_start = int(d['relevant_lines_start']) # absolute position relevant_lines_end = int(d['relevant_lines_end']) @@ -300,8 +310,8 @@ class PRCodeSuggestions: else: body = f"**Suggestion:** {content} [{label}]\n```suggestion\n" + new_code_snippet + "\n```" code_suggestions.append({'body': body, 'relevant_file': 
-                                         'relevant_lines_start': relevant_lines_start,
-                                         'relevant_lines_end': relevant_lines_end})
+                                        'relevant_lines_start': relevant_lines_start,
+                                        'relevant_lines_end': relevant_lines_end})
             except Exception:
                 get_logger().info(f"Could not parse suggestion: {d}")
 
@@ -477,14 +487,15 @@ class PRCodeSuggestions:
         # sort suggestions_labels by the suggestion with the highest score
         if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
-            suggestions_labels = dict(sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
+            suggestions_labels = dict(
+                sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
             # sort the suggestions inside each label group by score
             for label, suggestions in suggestions_labels.items():
                 suggestions_labels[label] = sorted(suggestions, key=lambda x: x['score'], reverse=True)
-
+        counter_suggestions = 0
         for label, suggestions in suggestions_labels.items():
-            num_suggestions=len(suggestions)
+            num_suggestions = len(suggestions)
             pr_body += f"""{label.capitalize()}\n"""
             for i, suggestion in enumerate(suggestions):
@@ -508,8 +519,8 @@ class PRCodeSuggestions:
                     suggestion_content = insert_br_after_x_chars(suggestion_content, 90)
{suggestion_content}" - existing_code = suggestion['existing_code'].rstrip()+"\n" - improved_code = suggestion['improved_code'].rstrip()+"\n" + existing_code = suggestion['existing_code'].rstrip() + "\n" + improved_code = suggestion['improved_code'].rstrip() + "\n" diff = difflib.unified_diff(existing_code.split('\n'), improved_code.split('\n'), n=999) @@ -518,7 +529,7 @@ class PRCodeSuggestions: example_code = "" example_code += f"```diff\n{patch}\n```\n" - if i==0: + if i == 0: pr_body += f"""\n\n""" else: pr_body += f"""\n\n""" @@ -529,13 +540,20 @@ class PRCodeSuggestions: pr_body += f"""\n\n
{suggestion_summary}\n\n___\n\n""" pr_body += f""" **{suggestion_content}** - + [{relevant_file} {range_str}]({code_snippet_link}) -{example_code} +{example_code.rstrip()} """ + if (get_settings().pr_code_suggestions.apply_suggestions_checkbox and + (isinstance(self.git_provider, GithubProvider) or isinstance(self.git_provider, + GitLabProvider))): + # add a checkbox line, to create a committal suggestion from the table suggestion + if '...' not in patch: + pr_body += f"""\n- [ ] **Apply this suggestion** \n\n""" + if get_settings().pr_code_suggestions.self_reflect_on_suggestions: - pr_body +=f"\n\n
-                    pr_body +=f"\n\nSuggestion importance[1-10]: {suggestion['score']}\n\n"
+                    pr_body += f"Suggestion importance[1-10]: {suggestion['score']}\n\n"
                     pr_body += f"Why: {suggestion['score_why']}\n\n"
" @@ -546,7 +564,7 @@ class PRCodeSuggestions: pr_body += f"{suggestion['score']}\n\n" pr_body += f"" - + counter_suggestions += 1 # pr_body += "
" # pr_body += """""" @@ -570,14 +588,54 @@ class PRCodeSuggestions: "diff": patches_diff, 'num_code_suggestions': len(suggestion_list)} environment = Environment(undefined=StrictUndefined) - system_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.system).render( + system_prompt_reflect = environment.from_string( + get_settings().pr_code_suggestions_reflect_prompt.system).render( variables) - user_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.user).render(variables) + user_prompt_reflect = environment.from_string( + get_settings().pr_code_suggestions_reflect_prompt.user).render(variables) with get_logger().contextualize(command="self_reflect_on_suggestions"): response_reflect, finish_reason_reflect = await self.ai_handler.chat_completion(model=model, - system=system_prompt_reflect, - user=user_prompt_reflect) + system=system_prompt_reflect, + user=user_prompt_reflect) except Exception as e: get_logger().info(f"Could not reflect on suggestions, error: {e}") return "" - return response_reflect \ No newline at end of file + return response_reflect + + async def handle_apply_suggestion(self): + try: + get_logger().info('Processing "apply" suggestion...') + suggestion_number = get_settings().apply_suggestion + comment_after = get_settings().pr_code_suggestions.get('comment_after', None) + if suggestion_number is None or comment_after is None: + get_logger().error('Invalid suggestion number or comment_after') + return False + suggestions = parse_suggestions_content(comment_after) + if not suggestions: + get_logger().error('Failed to parse suggestions') + return False + suggestion = suggestions[suggestion_number] + if hasattr(self, 'main_language'): + self.git_provider.main_language = self.main_language + relevant_file = suggestion['suggestion_orig_location']['filename'] + relevant_lines_start = int(suggestion['suggestion_orig_location']['start_line']) + relevant_lines_end = int(suggestion['suggestion_orig_location']['end_line']) + content = suggestion['suggestion_summary'] + new_code_snippet = suggestion['new_code_snippet'] + label = suggestion['category'] + score = suggestion['score'] + if new_code_snippet: + new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) + body = f"**Suggestion:** {content} [{label}, importance: {score}]\n```suggestion\n" + new_code_snippet + "\n```" + original_suggestion = suggestion + code_suggestions = [({'original_suggestion': original_suggestion, + 'body': body, 'relevant_file': relevant_file, + 'relevant_lines_start': relevant_lines_start, + 'relevant_lines_end': relevant_lines_end})] + is_successful = self.git_provider.publish_code_suggestions(code_suggestions) + get_settings().set("suggestion_score", score) + get_settings().set("suggestion_label", label) + except Exception as e: + get_logger().info(f"Failed to apply suggestion, error: {e}") + is_successful = False + return is_successful diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index fff0c3f5..795e4a2f 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -7,11 +7,14 @@ from jinja2 import Environment, StrictUndefined from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, 
diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py
index fff0c3f5..795e4a2f 100644
--- a/pr_agent/tools/pr_description.py
+++ b/pr_agent/tools/pr_description.py
@@ -7,11 +7,14 @@ from jinja2 import Environment, StrictUndefined
 
 from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
 from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
-from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
+from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_diff_multiple_patchs, \
+    OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD
 from pr_agent.algo.token_handler import TokenHandler
-from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels, ModelType, show_relevant_configurations
+from pr_agent.algo.utils import set_custom_labels
+from pr_agent.algo.utils import load_yaml, get_user_labels, ModelType, show_relevant_configurations, get_max_tokens, \
+    clip_tokens
 from pr_agent.config_loader import get_settings
-from pr_agent.git_providers import get_git_provider, get_git_provider_with_context
+from pr_agent.git_providers import get_git_provider, GithubProvider, get_git_provider_with_context
 from pr_agent.git_providers.git_provider import get_main_pr_language
 from pr_agent.log import get_logger
 from pr_agent.servers.help import HelpMessage
@@ -56,6 +59,7 @@ class PRDescription:
             "custom_labels_class": "",  # will be filled if necessary in 'set_custom_labels' function
             "enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types,
         }
+        self.user_description = self.git_provider.get_user_description()
 
         # Initialize the token handler
@@ -163,32 +167,105 @@ class PRDescription:
         if get_settings().pr_description.use_description_markers and 'pr_agent:' not in self.user_description:
             return None
 
-        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
-        if self.patches_diff:
-            get_logger().debug(f"PR diff", artifact=self.patches_diff)
-            self.prediction = await self._get_prediction(model)
+        large_pr_handling = get_settings().pr_description.enable_large_pr_handling and "pr_description_only_files_prompts" in get_settings()
+        patches_diff = get_pr_diff(self.git_provider, self.token_handler, model, large_pr_handling=large_pr_handling)
+        if not large_pr_handling or patches_diff:
+            self.patches_diff = patches_diff
+            if patches_diff:
+                get_logger().debug(f"PR diff", artifact=self.patches_diff)
+                self.prediction = await self._get_prediction(model, patches_diff, prompt="pr_description_prompt")
+            else:
+                get_logger().error(f"Error getting PR diff {self.pr_id}")
+                self.prediction = None
         else:
-            get_logger().error(f"Error getting PR diff {self.pr_id}")
-            self.prediction = None
+            # get the diff in multiple patches, with the token handler only for the files prompt
+            get_logger().debug('large_pr_handling for describe')
+            token_handler_only_files_prompt = TokenHandler(
+                self.git_provider.pr,
+                self.vars,
+                get_settings().pr_description_only_files_prompts.system,
+                get_settings().pr_description_only_files_prompts.user,
+            )
+            (patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict,
+             files_in_patches_list) = get_pr_diff_multiple_patchs(
+                self.git_provider, token_handler_only_files_prompt, model)
-    async def _get_prediction(self, model: str) -> str:
-        """
-        Generate an AI prediction for the PR description based on the provided model.
+            # get the files prediction for each patch
+            file_description_str_list = []
+            for i, patches in enumerate(patches_compressed_list):
+                patches_diff = "\n".join(patches)
+                get_logger().debug(f"PR diff number {i + 1} for describe files")
+                prediction_files = await self._get_prediction(model, patches_diff,
+                                                              prompt="pr_description_only_files_prompts")
+                prediction_files = prediction_files.strip().removeprefix('```yaml').strip('`').strip()
+                if load_yaml(prediction_files) and prediction_files.startswith('pr_files'):
+                    prediction_files = prediction_files.removeprefix('pr_files:').strip()
+                    file_description_str_list.append(prediction_files)
+                else:
+                    get_logger().debug(f"Failed to generate predictions in iteration {i + 1} for describe files")
-        Args:
-            model (str): The name of the model to be used for generating the prediction.
+            # generate files_walkthrough string, with proper token handling
+            token_handler_only_description_prompt = TokenHandler(
+                self.git_provider.pr,
+                self.vars,
+                get_settings().pr_description_only_description_prompts.system,
+                get_settings().pr_description_only_description_prompts.user)
+            files_walkthrough = "\n".join(file_description_str_list)
+            if remaining_files_list:
+                files_walkthrough += "\n\nNo more token budget. Additional unprocessed files:"
+                for file in remaining_files_list:
+                    files_walkthrough += f"\n- {file}"
+            if deleted_files_list:
+                files_walkthrough += "\n\nAdditional deleted files:"
+                for file in deleted_files_list:
+                    files_walkthrough += f"\n- {file}"
+            tokens_files_walkthrough = len(token_handler_only_description_prompt.encoder.encode(files_walkthrough))
+            total_tokens = token_handler_only_description_prompt.prompt_tokens + tokens_files_walkthrough
+            max_tokens_model = get_max_tokens(model)
+            if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
+                # clip files_walkthrough to fit the tokens within the limit
+                files_walkthrough = clip_tokens(files_walkthrough,
+                                                max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD - token_handler_only_description_prompt.prompt_tokens,
+                                                num_input_tokens=tokens_files_walkthrough)
-        Returns:
-            str: The generated AI prediction.
-        """
+            # PR header inference
+            # TODO - add deleted and unprocessed files to the prompt ('files_walkthrough'), as extra data
+            get_logger().debug(f"PR diff only description", artifact=files_walkthrough)
+            prediction_headers = await self._get_prediction(model, patches_diff=files_walkthrough,
+                                                            prompt="pr_description_only_description_prompts")
+            prediction_headers = prediction_headers.strip().removeprefix('```yaml').strip('`').strip()
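The repeated strip/removeprefix chain above defends against models wrapping their YAML output in a code fence. As a standalone helper it would read (a sketch, not part of this patch):

```python
FENCE = "`" * 3  # literal triple backtick, built here to avoid nesting fences

def strip_yaml_fence(prediction: str) -> str:
    # Turn a fenced "yaml" block returned by the model into bare YAML text,
    # mirroring the inline removeprefix/strip logic above.
    prediction = prediction.strip().removeprefix(FENCE + "yaml")
    return prediction.strip("`").strip()
```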
+            if get_settings().pr_description.mention_extra_files:
+                for file in remaining_files_list:
+                    extra_file_yaml = f"""\
+- filename: |
+    {file}
+  changes_summary: |
+    ...
+  changes_title: |
+    ...
+  label: |
+    not processed (token-limit)
+"""
+                    files_walkthrough = files_walkthrough.strip() + "\n" + extra_file_yaml.strip()
+
+            # final processing
+            self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough
+            if not load_yaml(self.prediction):
+                get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}")
+                if load_yaml(prediction_headers):
+                    get_logger().debug(f"Using only headers for describe {self.pr_id}")
+                    self.prediction = prediction_headers
+
+    async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_description_prompt") -> str:
         variables = copy.deepcopy(self.vars)
         variables["diff"] = patches_diff  # update diff
 
         environment = Environment(undefined=StrictUndefined)
         set_custom_labels(variables, self.git_provider)
         self.variables = variables
-        system_prompt = environment.from_string(get_settings().pr_description_prompt.system).render(variables)
-        user_prompt = environment.from_string(get_settings().pr_description_prompt.user).render(variables)
+
+        system_prompt = environment.from_string(get_settings().get(prompt, {}).get("system", "")).render(variables)
+        user_prompt = environment.from_string(get_settings().get(prompt, {}).get("user", "")).render(variables)
 
         response, finish_reason = await self.ai_handler.chat_completion(
             model=model,
@@ -351,7 +428,7 @@ class PRDescription:
             filename = file['filename'].replace("'", "`").replace('"', '`')
             changes_summary = file['changes_summary']
             changes_title = file['changes_title'].strip()
-            label = file.get('label')
+            label = (file.get('label') or '').strip().lower()  # guard against a missing label before strip()
             if label not in file_label_dict:
                 file_label_dict[label] = []
             file_label_dict[label].append((filename, changes_title, changes_summary))
@@ -392,6 +469,7 @@ class PRDescription:
         for filename, file_changes_title, file_change_description in list_tuples:
             filename = filename.replace("'", "`").rstrip()
             filename_publish = filename.split("/")[-1]
+            file_changes_title_code = f"{file_changes_title}"
             file_changes_title_code_br = insert_br_after_x_chars(file_changes_title_code, x=(delta - 5)).strip()
             if len(file_changes_title_code_br) < (delta - 5):
@@ -423,14 +501,16 @@
                     {filename}
+                    {file_change_description_br}
-
+                    {diff_plus_minus}{delta_nbsp}
+                """
 
         if use_collapsible_file_list:
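For context, the walkthrough table being rendered here is driven by the label-to-files mapping built in `PRDescription` above. The grouping step, reduced to a sketch (with the same missing-label guard added earlier):

```python
from collections import defaultdict

def group_files_by_label(pr_files: list) -> dict:
    # One bucket per semantic label; each entry keeps
    # (filename, changes_title, changes_summary) for the table rows.
    file_label_dict = defaultdict(list)
    for file in pr_files:
        filename = file['filename'].replace("'", "`").replace('"', '`')
        label = (file.get('label') or '').strip().lower()
        file_label_dict[label].append(
            (filename, file['changes_title'].strip(), file['changes_summary']))
    return file_label_dict
```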