diff --git a/docs/docs/tools/describe.md b/docs/docs/tools/describe.md
index f5357dee..9fe50e66 100644
--- a/docs/docs/tools/describe.md
+++ b/docs/docs/tools/describe.md
@@ -87,6 +87,10 @@ publish_labels = ...
collapsible_file_list | If set to true, the file list in the "Changes walkthrough" section will be collapsible. If set to "adaptive", the file list will be collapsible only if there are more than 8 files. Default is "adaptive". |
+
+ enable_large_pr_handling | Pro feature. If set to true, for a large PR the tool will make several AI calls and combine the results to cover more files. Default is true. |
+
enable_help_text | If set to true, the tool will display a help text in the comment. Default is false. |
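For reference, the new option maps to the `[pr_description]` keys added to `configuration.toml` further down this diff; a minimal sketch of what a user would set (key names are taken from this diff, comments are paraphrases):

```toml
[pr_description]
enable_large_pr_handling = true  # Pro feature: cover more files in large PRs via several AI calls
max_ai_calls = 3                 # total call budget; one call is reserved for the final summarization
mention_extra_files = true       # list files that did not fit into the token budget
```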
diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index 731af5a2..818e98ff 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -24,26 +24,10 @@ ADDED_FILES_ = "Additional added files (insufficient token budget to process):\n
OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD = 1000
OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
+
+
def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str,
- add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str:
- """
- Returns a string with the diff of the pull request, applying diff minimization techniques if needed.
-
- Args:
- git_provider (GitProvider): An object of the GitProvider class representing the Git provider used for the pull
- request.
- token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the
- pull request.
- model (str): The name of the model used for tokenization.
- add_line_numbers_to_hunks (bool, optional): A boolean indicating whether to add line numbers to the hunks in the
- diff. Defaults to False.
- disable_extra_lines (bool, optional): A boolean indicating whether to disable the extension of each patch with
- extra lines of context. Defaults to False.
-
- Returns:
- str: A string with the diff of the pull request, applying diff minimization techniques if needed.
- """
-
+ add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False, large_pr_handling: bool = False) -> str:
if disable_extra_lines:
PATCH_EXTRA_LINES = 0
else:
@@ -87,39 +71,99 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
# if we are over the limit, start pruning
get_logger().info(f"Tokens: {total_tokens}, total tokens over limit: {get_max_tokens(model)}, "
f"pruning diff.")
- patches_compressed, modified_file_names, deleted_file_names, added_file_names, total_tokens_new = \
+ patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \
pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
+ if large_pr_handling and len(patches_compressed_list) > 1:
+ get_logger().info(f"Large PR handling mode, and found {len(patches_compressed_list)} patches with original diff.")
+ return "" # return empty string, as we generate multiple patches with a different prompt
+
+ # return the first patch
+ patches_compressed = patches_compressed_list[0]
+ total_tokens_new = total_tokens_list[0]
+ files_in_patch = files_in_patches_list[0]
+
# Insert additional information about added, modified, and deleted files if there is enough space
max_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD
curr_token = total_tokens_new # == token_handler.count_tokens(final_diff)+token_handler.prompt_tokens
final_diff = "\n".join(patches_compressed)
delta_tokens = 10
- if added_file_names and (max_tokens - curr_token) > delta_tokens:
- added_list_str = ADDED_FILES_ + "\n".join(added_file_names)
- added_list_str = clip_tokens(added_list_str, max_tokens - curr_token)
- if added_list_str:
- final_diff = final_diff + "\n\n" + added_list_str
- curr_token += token_handler.count_tokens(added_list_str) + 2
- if modified_file_names and (max_tokens - curr_token) > delta_tokens:
- modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
- modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token)
- if modified_list_str:
- final_diff = final_diff + "\n\n" + modified_list_str
- curr_token += token_handler.count_tokens(modified_list_str) + 2
- if deleted_file_names and (max_tokens - curr_token) > delta_tokens:
- deleted_list_str = DELETED_FILES_ + "\n".join(deleted_file_names)
- deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token)
- if deleted_list_str:
- final_diff = final_diff + "\n\n" + deleted_list_str
- try:
- get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, "
- f"deleted_list_str: {deleted_list_str}")
- except Exception as e:
- pass
+ added_list_str = modified_list_str = deleted_list_str = ""
+ unprocessed_files = []
+ # generate the added, modified, and deleted files lists
+ if (max_tokens - curr_token) > delta_tokens:
+ for filename, file_values in file_dict.items():
+ if filename in files_in_patch:
+ continue
+ if file_values['edit_type'] == EDIT_TYPE.ADDED:
+ unprocessed_files.append(filename)
+ if not added_list_str:
+ added_list_str = ADDED_FILES_ + f"\n{filename}"
+ else:
+ added_list_str = added_list_str + f"\n{filename}"
+ elif file_values['edit_type'] in (EDIT_TYPE.MODIFIED, EDIT_TYPE.RENAMED):
+ unprocessed_files.append(filename)
+ if not modified_list_str:
+ modified_list_str = MORE_MODIFIED_FILES_ + f"\n{filename}"
+ else:
+ modified_list_str = modified_list_str + f"\n{filename}"
+ elif file_values['edit_type'] == EDIT_TYPE.DELETED:
+ # unprocessed_files.append(filename) is not needed: the file was deleted, so there is nothing left to process
+ if not deleted_list_str:
+ deleted_list_str = DELETED_FILES_ + f"\n{filename}"
+ else:
+ deleted_list_str = deleted_list_str + f"\n{filename}"
+
+ # prune the added, modified, and deleted files lists, and add them to the final diff
+ added_list_str = clip_tokens(added_list_str, max_tokens - curr_token)
+ if added_list_str:
+ final_diff = final_diff + "\n\n" + added_list_str
+ curr_token += token_handler.count_tokens(added_list_str) + 2
+ modified_list_str = clip_tokens(modified_list_str, max_tokens - curr_token)
+ if modified_list_str:
+ final_diff = final_diff + "\n\n" + modified_list_str
+ curr_token += token_handler.count_tokens(modified_list_str) + 2
+ deleted_list_str = clip_tokens(deleted_list_str, max_tokens - curr_token)
+ if deleted_list_str:
+ final_diff = final_diff + "\n\n" + deleted_list_str
+
+ get_logger().debug(f"After pruning, added_list_str: {added_list_str}, modified_list_str: {modified_list_str}, "
+ f"deleted_list_str: {deleted_list_str}")
return final_diff
+
+
+def get_pr_diff_multiple_patchs(git_provider: GitProvider, token_handler: TokenHandler, model: str,
+ add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False):
+ try:
+ diff_files_original = git_provider.get_diff_files()
+ except RateLimitExceededException as e:
+ get_logger().error(f"Rate limit exceeded for git provider API. original message {e}")
+ raise
+
+ diff_files = filter_ignored(diff_files_original)
+ if diff_files != diff_files_original:
+ try:
+ get_logger().info(f"Filtered out {len(diff_files_original) - len(diff_files)} files")
+ new_names = set([a.filename for a in diff_files])
+ orig_names = set([a.filename for a in diff_files_original])
+ get_logger().info(f"Filtered out files: {orig_names - new_names}")
+ except Exception as e:
+ pass
+
+ # get pr languages
+ pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files)
+ if pr_languages:
+ try:
+ get_logger().info(f"PR main language: {pr_languages[0]['language']}")
+ except Exception as e:
+ pass
+
+ patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \
+ pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks)
+
+ return patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
+
+
def pr_generate_extended_diff(pr_languages: list,
token_handler: TokenHandler,
add_line_numbers_to_hunks: bool,
@@ -164,41 +208,16 @@ def pr_generate_extended_diff(pr_languages: list,
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str,
- convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, int]:
- """
- Generate a compressed diff string for a pull request, using diff minimization techniques to reduce the number of
- tokens used.
- Args:
- top_langs (list): A list of dictionaries representing the languages used in the pull request and their
- corresponding files.
- token_handler (TokenHandler): An object of the TokenHandler class used for handling tokens in the context of the
- pull request.
- model (str): The model used for tokenization.
- convert_hunks_to_line_numbers (bool): A boolean indicating whether to convert hunks to line numbers in the diff.
- Returns:
- Tuple[list, list, list]: A tuple containing the following lists:
- - patches: A list of compressed diff patches for each file in the pull request.
- - modified_files_list: A list of file names that were skipped due to large patch size.
- - deleted_files_list: A list of file names that were deleted in the pull request.
-
- Minimization techniques to reduce the number of tokens:
- 0. Start from the largest diff patch to smaller ones
- 1. Don't use extend context lines around diff
- 2. Minimize deleted files
- 3. Minimize deleted hunks
- 4. Minimize all remaining files when you reach token limit
- """
-
- patches = []
- added_files_list = []
- modified_files_list = []
+ convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list, list, dict, list]:
deleted_files_list = []
+
# sort each one of the languages in top_langs by the number of tokens in the diff
sorted_files = []
for lang in top_langs:
sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
- total_tokens = token_handler.prompt_tokens
+ # generate patches for each file, and count tokens
+ file_dict = {}
for file in sorted_files:
original_file_content_str = file.base_file
new_file_content_str = file.head_file
@@ -210,55 +229,85 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
patch = handle_patch_deletions(patch, original_file_content_str,
new_file_content_str, file.filename, file.edit_type)
if patch is None:
- # if not deleted_files_list:
- # total_tokens += token_handler.count_tokens(DELETED_FILES_)
if file.filename not in deleted_files_list:
deleted_files_list.append(file.filename)
- # total_tokens += token_handler.count_tokens(file.filename) + 1
continue
if convert_hunks_to_line_numbers:
patch = convert_to_hunks_with_lines_numbers(patch, file)
new_patch_tokens = token_handler.count_tokens(patch)
+ file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type}
+
+ max_tokens_model = get_max_tokens(model)
+
+ # first iteration
+ files_in_patches_list = []
+ remaining_files_list = [file.filename for file in sorted_files]
+ patches_list = []
+ total_tokens_list = []
+ total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict,
+ max_tokens_model, remaining_files_list, token_handler)
+ patches_list.append(patches)
+ total_tokens_list.append(total_tokens)
+ files_in_patches_list.append(files_in_patch_list)
+
+ # additional iterations (if needed)
+ NUMBER_OF_ALLOWED_ITERATIONS = get_settings().pr_description.max_ai_calls - 1 # reserve one call for the final summarization
+ for i in range(NUMBER_OF_ALLOWED_ITERATIONS - 1): # the first iteration was already performed above
+ if remaining_files_list:
+ total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers,
+ file_dict,
+ max_tokens_model,
+ remaining_files_list, token_handler)
+ patches_list.append(patches)
+ total_tokens_list.append(total_tokens)
+ files_in_patches_list.append(files_in_patch_list)
+ else:
+ break
+
+ return patches_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list
+
+
+def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model, remaining_files_list_prev, token_handler):
+ total_tokens = token_handler.prompt_tokens # initial tokens
+ patches = []
+ remaining_files_list_new = []
+ files_in_patch_list = []
+ for filename, data in file_dict.items():
+ if filename not in remaining_files_list_prev:
+ continue
+
+ patch = data['patch']
+ new_patch_tokens = data['tokens']
+ edit_type = data['edit_type']
# Hard Stop, no more tokens
- if total_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
- get_logger().warning(f"File was fully skipped, no more tokens: {file.filename}.")
+ if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
+ get_logger().warning(f"File was fully skipped, no more tokens: {filename}.")
continue
# If the patch is too large, just show the file name
- if total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
+ if total_tokens + new_patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD:
# Current logic is to skip the patch if it's too large
# TODO: Option for alternative logic to remove hunks from the patch to reduce the number of tokens
# until we meet the requirements
if get_settings().config.verbosity_level >= 2:
- get_logger().warning(f"Patch too large, minimizing it, {file.filename}")
- if file.edit_type == EDIT_TYPE.ADDED:
- # if not added_files_list:
- # total_tokens += token_handler.count_tokens(ADDED_FILES_)
- if file.filename not in added_files_list:
- added_files_list.append(file.filename)
- # total_tokens += token_handler.count_tokens(file.filename) + 1
- else:
- # if not modified_files_list:
- # total_tokens += token_handler.count_tokens(MORE_MODIFIED_FILES_)
- if file.filename not in modified_files_list:
- modified_files_list.append(file.filename)
- # total_tokens += token_handler.count_tokens(file.filename) + 1
+ get_logger().warning(f"Patch too large, skipping it, {filename}")
+ remaining_files_list_new.append(filename)
continue
if patch:
if not convert_hunks_to_line_numbers:
- patch_final = f"\n\n## file: '{file.filename.strip()}\n\n{patch.strip()}\n'"
+ patch_final = f"\n\n## file: '{filename.strip()}\n\n{patch.strip()}\n'"
else:
patch_final = "\n\n" + patch.strip()
patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final)
+ files_in_patch_list.append(filename)
if get_settings().config.verbosity_level >= 2:
- get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}")
-
- return patches, modified_files_list, deleted_files_list, added_files_list, total_tokens
+ get_logger().info(f"Tokens: {total_tokens}, last filename: {filename}")
+ return total_tokens, patches, remaining_files_list_new, files_in_patch_list
+
+
async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR):
@@ -417,4 +466,4 @@ def get_pr_multi_diffs(git_provider: GitProvider,
final_diff = "\n".join(patches)
final_diff_list.append(final_diff)
- return final_diff_list
\ No newline at end of file
+ return final_diff_list
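For intuition, `pr_generate_compressed_diff` together with `generate_full_patch` performs greedy, token-budgeted packing: per-file patches are placed into successive chunks, one chunk per AI call, and patches that do not fit are deferred to the next chunk. A minimal standalone sketch under simplified assumptions (precomputed token counts, a single budget instead of the soft/hard thresholds; `pack_patches` is illustrative, not the actual API):

```python
from typing import Dict, List

def pack_patches(file_dict: Dict[str, dict], budget: int, max_calls: int) -> List[List[str]]:
    """Greedily pack per-file patches into token-budgeted chunks, one chunk per AI call."""
    remaining = list(file_dict)        # filenames still waiting for a chunk
    chunks: List[List[str]] = []
    for _ in range(max_calls):
        if not remaining:
            break
        used = 0
        chunk: List[str] = []
        deferred: List[str] = []
        for name in remaining:
            if used + file_dict[name]["tokens"] > budget:
                deferred.append(name)  # does not fit this chunk; retry in the next call
                continue
            chunk.append(file_dict[name]["patch"])
            used += file_dict[name]["tokens"]
        chunks.append(chunk)
        remaining = deferred
    return chunks

# toy example: a budget that fits two of the three patches per call
files = {
    "a.py": {"patch": "## a.py diff", "tokens": 60},
    "b.py": {"patch": "## b.py diff", "tokens": 50},
    "c.py": {"patch": "## c.py diff", "tokens": 70},
}
print(pack_patches(files, budget=120, max_calls=3))
# [['## a.py diff', '## b.py diff'], ['## c.py diff']]
```

As in the diff, an oversized patch is skipped rather than truncated, so each chunk stays within the model's context window.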
diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index 2154ec5e..76521939 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -74,7 +74,10 @@ inline_file_summary=false # false, true, 'table'
# markers
use_description_markers=false
include_generated_by_header=true
-
+# large PR mode 💎
+enable_large_pr_handling=true
+max_ai_calls=3
+mention_extra_files=true
#custom_labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other']
[pr_questions] # /ask #
@@ -82,7 +85,7 @@ enable_help_text=false
[pr_code_suggestions] # /improve #
-max_context_tokens=8000
+max_context_tokens=10000
num_code_suggestions=4
commitable_code_suggestions = false
extra_instructions = ""
@@ -105,16 +108,34 @@ final_clip_factor = 0.8
demand_code_suggestions_self_review=false # add a checkbox for the author to self-review the code suggestions
code_suggestions_self_review_text= "**Author self-review**: I have reviewed the PR code suggestions, and addressed the relevant ones."
approve_pr_on_self_review=false # Pro feature. if true, the PR will be auto-approved after the author clicks on the self-review checkbox
+# Suggestion impact
+publish_post_process_suggestion_impact=true
+
+[pr_custom_prompt] # /custom_prompt #
+prompt = """\
+The code suggestions should focus only on the following:
+- ...
+- ...
+...
+"""
+suggestions_score_threshold=0
+num_code_suggestions_per_chunk=4
+self_reflect_on_custom_suggestions=true
+enable_help_text=false
+
[pr_add_docs] # /add_docs #
extra_instructions = ""
-docs_style = "Sphinx Style" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
+docs_style = "Sphinx" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText"
+file = "" # in case there are several components with the same name, you can specify the relevant file
+class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
[pr_update_changelog] # /update_changelog #
push_changelog_changes=false
extra_instructions = ""
[pr_analyze] # /analyze #
+enable_help_text=true
[pr_test] # /test #
extra_instructions = ""
@@ -129,13 +150,14 @@ enable_help_text=false
num_code_suggestions=4
extra_instructions = ""
file = "" # in case there are several components with the same name, you can specify the relevant file
-class_name = ""
+class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name
[checks] # /checks (pro feature) #
enable_auto_checks_feedback=true
excluded_checks_list=["lint"] # list of checks to exclude, for example: ["check1", "check2"]
persistent_comment=true
enable_help_text=true
+final_update_message = false
[pr_help] # /help #
@@ -148,15 +170,16 @@ ratelimit_retries = 5
base_url = "https://api.github.com"
publish_inline_comments_fallback_with_verification = true
try_fix_invalid_inline_comments = true
+app_name = "pr-agent"
[github_action_config]
# auto_review = true # set as env var in .github/workflows/pr-agent.yaml
# auto_describe = true # set as env var in .github/workflows/pr-agent.yaml
# auto_improve = true # set as env var in .github/workflows/pr-agent.yaml
-# enable_output = true # set as env var in .github/workflows/pr-agent.yaml
[github_app]
# these toggles allow running the github app from custom deployments
+bot_user = "github-actions[bot]"
override_deployment_type = true
# settings for "pull_request" event
handle_pr_actions = ['opened', 'reopened', 'ready_for_review']
@@ -180,7 +203,14 @@ ignore_pr_title = []
ignore_bot_pr = true
[gitlab]
-url = "https://gitlab.com" # URL to the gitlab service
+# URL to the gitlab service
+url = "https://gitlab.com"
+# Projects to poll (either project id or namespace/project_name syntax can be used)
+projects_to_monitor = ['org_name/repo_name']
+# Polling trigger
+magic_word = "AutoReview"
+# Polling interval
+polling_interval_seconds = 30
pr_commands = [
"/describe",
"/review --pr_reviewer.num_code_suggestions=0",
@@ -229,10 +259,18 @@ force_update_dataset = false
max_issues_to_scan = 500
vectordb = "pinecone"
+[pr_find_similar_component]
+class_name = ""
+file = ""
+search_from_org = false
+allow_fallback_less_words = true
+number_of_keywords = 5
+number_of_results = 5
+
[pinecone]
# fill and place in .secrets.toml
#api_key = ...
# environment = "gcp-starter"
[lancedb]
-uri = "./lancedb"
\ No newline at end of file
+uri = "./lancedb"
diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml
index 80df6d60..9c58eb3d 100644
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@@ -37,7 +37,7 @@ class PRType(str, Enum):
{%- if enable_semantic_files_types %}
-Class FileDescription(BaseModel):
+class FileDescription(BaseModel):
filename: str = Field(description="the relevant file full path")
language: str = Field(description="the relevant file language")
changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
@@ -45,7 +45,7 @@ Class FileDescription(BaseModel):
label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
{%- endif %}
-Class PRDescription(BaseModel):
+class PRDescription(BaseModel):
type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
{%- if enable_semantic_files_types %}
pr_files[List[FileDescription]] = Field(max_items=15, description="a list of the files in the PR, and their changes summary.")
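The prompt describes the expected model output in Pydantic-like pseudocode, which is why the `Class` to `class` casing fixes matter: the model mirrors this text. A roughly equivalent runnable sketch (Pydantic v1 style; the `PRType` members are illustrative, only fields visible in this hunk are included, and the prompt's `pr_files[List[FileDescription]]` becomes an ordinary type annotation):

```python
from enum import Enum
from typing import List
from pydantic import BaseModel, Field

class PRType(str, Enum):  # member values are illustrative, not taken from this diff
    bug_fix = "Bug fix"
    tests = "Tests"
    enhancement = "Enhancement"
    documentation = "Documentation"
    other = "Other"

class FileDescription(BaseModel):
    filename: str = Field(description="the relevant file full path")
    language: str = Field(description="the relevant file language")
    changes_summary: str = Field(description="concise summary of the changes (1-4 bullet points)")
    label: str = Field(description="a single semantic label for the type of change")

class PRDescription(BaseModel):
    type: List[PRType] = Field(description="one or more types that describe the PR content")
    pr_files: List[FileDescription] = Field(max_items=15,
                                            description="the files in the PR and their changes summary")
```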
diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py
index 777ffd99..d08c723b 100644
--- a/pr_agent/tools/pr_code_suggestions.py
+++ b/pr_agent/tools/pr_code_suggestions.py
@@ -35,7 +35,6 @@ class PRCodeSuggestions:
get_logger().info(f"Setting max_model_tokens to {MAX_CONTEXT_TOKENS_IMPROVE} for PR improve")
get_settings().config.max_model_tokens = MAX_CONTEXT_TOKENS_IMPROVE
-
# extended mode
try:
self.is_extended = self._get_is_extended(args or [])
@@ -116,7 +115,7 @@ class PRCodeSuggestions:
# require self-review
if get_settings().pr_code_suggestions.demand_code_suggestions_self_review:
- text= get_settings().pr_code_suggestions.code_suggestions_self_review_text
+ text = get_settings().pr_code_suggestions.code_suggestions_self_review_text
pr_body += f"\n\n- [ ] {text}"
if get_settings().pr_code_suggestions.approve_pr_on_self_review:
pr_body += ' '
@@ -193,8 +192,9 @@ class PRCodeSuggestions:
# self-reflect on suggestions
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
- model = get_settings().config.model_turbo # use turbo model for self-reflection, since it is an easier task
- response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"], patches_diff, model=model)
+ model = get_settings().config.model_turbo # use turbo model for self-reflection, since it is an easier task
+ response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"], patches_diff,
+ model=model)
if response_reflect:
response_reflect_yaml = load_yaml(response_reflect)
code_suggestions_feedback = response_reflect_yaml["code_suggestions"]
@@ -203,7 +203,7 @@ class PRCodeSuggestions:
try:
suggestion["score"] = code_suggestions_feedback[i]["suggestion_score"]
suggestion["score_why"] = code_suggestions_feedback[i]["why"]
- except Exception as e: #
+ except Exception as e: #
get_logger().error(f"Error processing suggestion score {i}",
artifact={"suggestion": suggestion,
"code_suggestions_feedback": code_suggestions_feedback[i]})
@@ -226,7 +226,7 @@ class PRCodeSuggestions:
suggestion['improved_code'] = suggestion['improved_code'][:max_code_suggestion_length]
suggestion['improved_code'] += f"\n{suggestion_truncation_message}"
get_logger().info(f"Truncated suggestion from {len(suggestion['improved_code'])} "
- f"characters to {max_code_suggestion_length} characters")
+ f"characters to {max_code_suggestion_length} characters")
return suggestion
def _prepare_pr_code_suggestions(self, predictions: str) -> Dict:
@@ -240,17 +240,24 @@ class PRCodeSuggestions:
one_sentence_summary_list = []
for i, suggestion in enumerate(data['code_suggestions']):
try:
- if (not suggestion or 'one_sentence_summary' not in suggestion or
- 'label' not in suggestion or 'relevant_file' not in suggestion):
- get_logger().debug(f"Skipping suggestion {i + 1}, because it is invalid: {suggestion}")
+ needed_keys = ['one_sentence_summary', 'label', 'relevant_file', 'relevant_lines_start', 'relevant_lines_end']
+ is_valid_keys = True
+ for key in needed_keys:
+ if key not in suggestion:
+ is_valid_keys = False
+ get_logger().debug(f"Skipping suggestion {i + 1}, because it does not contain '{key}':\n'{suggestion}")
+ break
+ if not is_valid_keys:
continue
if suggestion['one_sentence_summary'] in one_sentence_summary_list:
get_logger().debug(f"Skipping suggestion {i + 1}, because it is a duplicate: {suggestion}")
continue
- if 'const' in suggestion['suggestion_content'] and 'instead' in suggestion['suggestion_content'] and 'let' in suggestion['suggestion_content']:
- get_logger().debug(f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}")
+ if ('const' in suggestion['suggestion_content'] and 'instead' in suggestion['suggestion_content']
+ and 'let' in suggestion['suggestion_content']):
+ get_logger().debug(
+ f"Skipping suggestion {i + 1}, because it uses 'const instead let': {suggestion}")
continue
if ('existing_code' in suggestion) and ('improved_code' in suggestion):
@@ -258,7 +265,7 @@ class PRCodeSuggestions:
get_logger().debug(
f"edited improved suggestion {i + 1}, because equal to existing code: {suggestion['existing_code']}")
if get_settings().pr_code_suggestions.commitable_code_suggestions:
- suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR
+ suggestion['improved_code'] = "" # we need 'existing_code' to locate the code in the PR
else:
suggestion['existing_code'] = ""
suggestion = self._truncate_if_needed(suggestion)
@@ -279,12 +286,15 @@ class PRCodeSuggestions:
if not data['code_suggestions']:
get_logger().info('No suggestions found to improve this PR.')
if self.progress_response:
- return self.git_provider.edit_comment(self.progress_response, body='No suggestions found to improve this PR.')
+ return self.git_provider.edit_comment(self.progress_response,
+ body='No suggestions found to improve this PR.')
else:
return self.git_provider.publish_comment('No suggestions found to improve this PR.')
for d in data['code_suggestions']:
try:
+ if get_settings().config.verbosity_level >= 2:
+ get_logger().info(f"suggestion: {d}")
relevant_file = d['relevant_file'].strip()
relevant_lines_start = int(d['relevant_lines_start']) # absolute position
relevant_lines_end = int(d['relevant_lines_end'])
@@ -300,8 +310,8 @@ class PRCodeSuggestions:
else:
body = f"**Suggestion:** {content} [{label}]\n```suggestion\n" + new_code_snippet + "\n```"
code_suggestions.append({'body': body, 'relevant_file': relevant_file,
- 'relevant_lines_start': relevant_lines_start,
- 'relevant_lines_end': relevant_lines_end})
+ 'relevant_lines_start': relevant_lines_start,
+ 'relevant_lines_end': relevant_lines_end})
except Exception:
get_logger().info(f"Could not parse suggestion: {d}")
@@ -477,14 +487,15 @@ class PRCodeSuggestions:
# sort suggestions_labels by the suggestion with the highest score
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
- suggestions_labels = dict(sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
+ suggestions_labels = dict(
+ sorted(suggestions_labels.items(), key=lambda x: max([s['score'] for s in x[1]]), reverse=True))
# sort the suggestions inside each label group by score
for label, suggestions in suggestions_labels.items():
suggestions_labels[label] = sorted(suggestions, key=lambda x: x['score'], reverse=True)
-
+ counter_suggestions = 0
for label, suggestions in suggestions_labels.items():
- num_suggestions=len(suggestions)
+ num_suggestions = len(suggestions)
pr_body += f"""
{label.capitalize()} | \n"""
for i, suggestion in enumerate(suggestions):
@@ -508,8 +519,8 @@ class PRCodeSuggestions:
suggestion_content = insert_br_after_x_chars(suggestion_content, 90)
# pr_body += f"
{suggestion_content}"
- existing_code = suggestion['existing_code'].rstrip()+"\n"
- improved_code = suggestion['improved_code'].rstrip()+"\n"
+ existing_code = suggestion['existing_code'].rstrip() + "\n"
+ improved_code = suggestion['improved_code'].rstrip() + "\n"
diff = difflib.unified_diff(existing_code.split('\n'),
improved_code.split('\n'), n=999)
@@ -518,7 +529,7 @@ class PRCodeSuggestions:
example_code = ""
example_code += f"```diff\n{patch}\n```\n"
- if i==0:
+ if i == 0:
pr_body += f"""\n\n"""
else:
pr_body += f""" | \n\n"""
@@ -529,13 +540,20 @@ class PRCodeSuggestions:
pr_body += f"""\n\n{suggestion_summary}\n\n___\n\n"""
pr_body += f"""
**{suggestion_content}**
-
+
[{relevant_file} {range_str}]({code_snippet_link})
-{example_code}
+{example_code.rstrip()}
"""
+ if (get_settings().pr_code_suggestions.apply_suggestions_checkbox and
+ isinstance(self.git_provider, (GithubProvider, GitLabProvider))):
+ # add a checkbox line, to create a committable suggestion from the table suggestion
+ if '...' not in patch:
+ pr_body += f"""\n- [ ] **Apply this suggestion** \n\n"""
+
if get_settings().pr_code_suggestions.self_reflect_on_suggestions:
- pr_body +=f"\n\nSuggestion importance[1-10]: {suggestion['score']}\n\n"
+ pr_body += f"Suggestion importance[1-10]: {suggestion['score']}\n\n"
pr_body += f"Why: {suggestion['score_why']}\n\n"
pr_body += f" "
@@ -546,7 +564,7 @@ class PRCodeSuggestions:
pr_body += f" | {suggestion['score']}\n\n"
pr_body += f" | "
-
+ counter_suggestions += 1
# pr_body += " "
# pr_body += """ |
"""
@@ -570,14 +588,54 @@ class PRCodeSuggestions:
"diff": patches_diff,
'num_code_suggestions': len(suggestion_list)}
environment = Environment(undefined=StrictUndefined)
- system_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.system).render(
+ system_prompt_reflect = environment.from_string(
+ get_settings().pr_code_suggestions_reflect_prompt.system).render(
variables)
- user_prompt_reflect = environment.from_string(get_settings().pr_code_suggestions_reflect_prompt.user).render(variables)
+ user_prompt_reflect = environment.from_string(
+ get_settings().pr_code_suggestions_reflect_prompt.user).render(variables)
with get_logger().contextualize(command="self_reflect_on_suggestions"):
response_reflect, finish_reason_reflect = await self.ai_handler.chat_completion(model=model,
- system=system_prompt_reflect,
- user=user_prompt_reflect)
+ system=system_prompt_reflect,
+ user=user_prompt_reflect)
except Exception as e:
get_logger().info(f"Could not reflect on suggestions, error: {e}")
return ""
- return response_reflect
\ No newline at end of file
+ return response_reflect
+
+ async def handle_apply_suggestion(self):
+ try:
+ get_logger().info('Processing "apply" suggestion...')
+ suggestion_number = get_settings().apply_suggestion
+ comment_after = get_settings().pr_code_suggestions.get('comment_after', None)
+ if suggestion_number is None or comment_after is None:
+ get_logger().error('Invalid suggestion number or comment_after')
+ return False
+ suggestions = parse_suggestions_content(comment_after)
+ if not suggestions:
+ get_logger().error('Failed to parse suggestions')
+ return False
+ suggestion = suggestions[suggestion_number]
+ if hasattr(self, 'main_language'):
+ self.git_provider.main_language = self.main_language
+ relevant_file = suggestion['suggestion_orig_location']['filename']
+ relevant_lines_start = int(suggestion['suggestion_orig_location']['start_line'])
+ relevant_lines_end = int(suggestion['suggestion_orig_location']['end_line'])
+ content = suggestion['suggestion_summary']
+ new_code_snippet = suggestion['new_code_snippet']
+ label = suggestion['category']
+ score = suggestion['score']
+ if new_code_snippet:
+ new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet)
+ body = f"**Suggestion:** {content} [{label}, importance: {score}]\n```suggestion\n" + new_code_snippet + "\n```"
+ original_suggestion = suggestion
+ code_suggestions = [({'original_suggestion': original_suggestion,
+ 'body': body, 'relevant_file': relevant_file,
+ 'relevant_lines_start': relevant_lines_start,
+ 'relevant_lines_end': relevant_lines_end})]
+ is_successful = self.git_provider.publish_code_suggestions(code_suggestions)
+ get_settings().set("suggestion_score", score)
+ get_settings().set("suggestion_label", label)
+ except Exception as e:
+ get_logger().info(f"Failed to apply suggestion, error: {e}")
+ is_successful = False
+ return is_successful
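The ranking logic touched in this file (sort label groups by their best suggestion score, then sort within each group) can be exercised in isolation; the sample data below is invented:

```python
suggestions_labels = {
    "possible bug":  [{"summary": "fix off-by-one", "score": 9}, {"summary": "guard None", "score": 6}],
    "best practice": [{"summary": "use f-string", "score": 4}],
    "performance":   [{"summary": "cache result", "score": 7}],
}

# rank label groups by their highest-scoring suggestion
suggestions_labels = dict(
    sorted(suggestions_labels.items(), key=lambda x: max(s["score"] for s in x[1]), reverse=True))

# then sort the suggestions inside each label group by score
for label, suggestions in suggestions_labels.items():
    suggestions_labels[label] = sorted(suggestions, key=lambda x: x["score"], reverse=True)

print(list(suggestions_labels))  # ['possible bug', 'performance', 'best practice']
```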
diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py
index fff0c3f5..795e4a2f 100644
--- a/pr_agent/tools/pr_description.py
+++ b/pr_agent/tools/pr_description.py
@@ -7,11 +7,14 @@ from jinja2 import Environment, StrictUndefined
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
-from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
+from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_diff_multiple_patchs, \
+ OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD
from pr_agent.algo.token_handler import TokenHandler
-from pr_agent.algo.utils import load_yaml, set_custom_labels, get_user_labels, ModelType, show_relevant_configurations
+from pr_agent.algo.utils import set_custom_labels
+from pr_agent.algo.utils import load_yaml, get_user_labels, ModelType, show_relevant_configurations, get_max_tokens, \
+ clip_tokens
from pr_agent.config_loader import get_settings
-from pr_agent.git_providers import get_git_provider, get_git_provider_with_context
+from pr_agent.git_providers import get_git_provider, GithubProvider, get_git_provider_with_context
from pr_agent.git_providers.git_provider import get_main_pr_language
from pr_agent.log import get_logger
from pr_agent.servers.help import HelpMessage
@@ -56,6 +59,7 @@ class PRDescription:
"custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function
"enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types,
}
+
self.user_description = self.git_provider.get_user_description()
# Initialize the token handler
@@ -163,32 +167,105 @@ class PRDescription:
if get_settings().pr_description.use_description_markers and 'pr_agent:' not in self.user_description:
return None
- self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
- if self.patches_diff:
- get_logger().debug(f"PR diff", artifact=self.patches_diff)
- self.prediction = await self._get_prediction(model)
+ large_pr_handling = get_settings().pr_description.enable_large_pr_handling and "pr_description_only_files_prompts" in get_settings()
+ patches_diff = get_pr_diff(self.git_provider, self.token_handler, model, large_pr_handling=large_pr_handling)
+ if not large_pr_handling or patches_diff:
+ self.patches_diff = patches_diff
+ if patches_diff:
+ get_logger().debug(f"PR diff", artifact=self.patches_diff)
+ self.prediction = await self._get_prediction(model, patches_diff, prompt="pr_description_prompt")
+ else:
+ get_logger().error(f"Error getting PR diff {self.pr_id}")
+ self.prediction = None
else:
- get_logger().error(f"Error getting PR diff {self.pr_id}")
- self.prediction = None
+ # get the diff in multiple patches, with the token handler only for the files prompt
+ get_logger().debug('large_pr_handling for describe')
+ token_handler_only_files_prompt = TokenHandler(
+ self.git_provider.pr,
+ self.vars,
+ get_settings().pr_description_only_files_prompts.system,
+ get_settings().pr_description_only_files_prompts.user,
+ )
+ (patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict,
+ files_in_patches_list) = get_pr_diff_multiple_patchs(
+ self.git_provider, token_handler_only_files_prompt, model)
- async def _get_prediction(self, model: str) -> str:
- """
- Generate an AI prediction for the PR description based on the provided model.
+ # get the files prediction for each patch
+ file_description_str_list = []
+ for i, patches in enumerate(patches_compressed_list):
+ patches_diff = "\n".join(patches)
+ get_logger().debug(f"PR diff number {i + 1} for describe files")
+ prediction_files = await self._get_prediction(model, patches_diff,
+ prompt="pr_description_only_files_prompts")
+ prediction_files = prediction_files.strip().removeprefix('```yaml').strip('`').strip()
+ if load_yaml(prediction_files) and prediction_files.startswith('pr_files'):
+ prediction_files = prediction_files.removeprefix('pr_files:').strip()
+ file_description_str_list.append(prediction_files)
+ else:
+ get_logger().debug(f"failed to generate predictions in iteration {i + 1} for describe files")
- Args:
- model (str): The name of the model to be used for generating the prediction.
+ # generate files_walkthrough string, with proper token handling
+ token_handler_only_description_prompt = TokenHandler(
+ self.git_provider.pr,
+ self.vars,
+ get_settings().pr_description_only_description_prompts.system,
+ get_settings().pr_description_only_description_prompts.user)
+ files_walkthrough = "\n".join(file_description_str_list)
+ if remaining_files_list:
+ files_walkthrough += "\n\nNo more token budget. Additional unprocessed files:"
+ for file in remaining_files_list:
+ files_walkthrough += f"\n- {file}"
+ if deleted_files_list:
+ files_walkthrough += "\n\nAdditional deleted files:"
+ for file in deleted_files_list:
+ files_walkthrough += f"\n- {file}"
+ tokens_files_walkthrough = len(token_handler_only_description_prompt.encoder.encode(files_walkthrough))
+ total_tokens = token_handler_only_description_prompt.prompt_tokens + tokens_files_walkthrough
+ max_tokens_model = get_max_tokens(model)
+ if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD:
+ # clip files_walkthrough to fit the tokens within the limit
+ files_walkthrough = clip_tokens(files_walkthrough,
+ max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD - token_handler_only_description_prompt.prompt_tokens,
+ num_input_tokens=tokens_files_walkthrough)
- Returns:
- str: The generated AI prediction.
- """
+ # PR header inference
+ # TODO: add deleted and unprocessed files to the prompt ('files_walkthrough'), as extra data
+ get_logger().debug(f"PR diff only description", artifact=files_walkthrough)
+ prediction_headers = await self._get_prediction(model, patches_diff=files_walkthrough,
+ prompt="pr_description_only_description_prompts")
+ prediction_headers = prediction_headers.strip().removeprefix('```yaml').strip('`').strip()
+ if get_settings().pr_description.mention_extra_files:
+ for file in remaining_files_list:
+ extra_file_yaml = f"""\
+- filename: |
+ {file}
+ changes_summary: |
+ ...
+ changes_title: |
+ ...
+ label: |
+ not processed (token-limit)
+"""
+ files_walkthrough = files_walkthrough.strip() + "\n" + extra_file_yaml.strip()
+
+ # final processing
+ self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough
+ if not load_yaml(self.prediction):
+ get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}")
+ if load_yaml(prediction_headers):
+ get_logger().debug(f"Using only headers for describe {self.pr_id}")
+ self.prediction = prediction_headers
+
+ async def _get_prediction(self, model: str, patches_diff: str, prompt: str = "pr_description_prompt") -> str:
variables = copy.deepcopy(self.vars)
- variables["diff"] = self.patches_diff # update diff
+ variables["diff"] = patches_diff # update diff
environment = Environment(undefined=StrictUndefined)
set_custom_labels(variables, self.git_provider)
self.variables = variables
- system_prompt = environment.from_string(get_settings().pr_description_prompt.system).render(variables)
- user_prompt = environment.from_string(get_settings().pr_description_prompt.user).render(variables)
+
+ system_prompt = environment.from_string(get_settings().get(prompt, {}).get("system", "")).render(variables)
+ user_prompt = environment.from_string(get_settings().get(prompt, {}).get("user", "")).render(variables)
response, finish_reason = await self.ai_handler.chat_completion(
model=model,
@@ -351,7 +428,7 @@ class PRDescription:
filename = file['filename'].replace("'", "`").replace('"', '`')
changes_summary = file['changes_summary']
changes_title = file['changes_title'].strip()
- label = file.get('label')
+ label = file.get('label', '').strip().lower()
if label not in file_label_dict:
file_label_dict[label] = []
file_label_dict[label].append((filename, changes_title, changes_summary))
@@ -392,6 +469,7 @@ class PRDescription:
for filename, file_changes_title, file_change_description in list_tuples:
filename = filename.replace("'", "`").rstrip()
filename_publish = filename.split("/")[-1]
+
file_changes_title_code = f"{file_changes_title}
"
file_changes_title_code_br = insert_br_after_x_chars(file_changes_title_code, x=(delta - 5)).strip()
if len(file_changes_title_code_br) < (delta - 5):
@@ -423,14 +501,16 @@ class PRDescription:
{filename}
+
{file_change_description_br}
-
+
{diff_plus_minus}{delta_nbsp} |
+
"""
if use_collapsible_file_list:
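Tying the describe changes together: on the large-PR path, the final prediction is the headers YAML concatenated with a `pr_files:` section assembled from the per-chunk file descriptions, then validated with `load_yaml`. A small sketch of that assembly step, with made-up YAML fragments and PyYAML standing in for pr-agent's `load_yaml` wrapper:

```python
import yaml  # stand-in for pr_agent.algo.utils.load_yaml

prediction_headers = """\
type:
- Enhancement
description: |
  Adds large-PR handling to the describe tool.
title: |
  Support multi-call description for large PRs
"""

files_walkthrough = """\
- filename: |
    pr_agent/algo/pr_processing.py
  changes_title: |
    Split compressed diff into multiple patches
  label: |
    enhancement
"""

# final assembly, mirroring PRDescription: headers + 'pr_files:' + walkthrough
prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough
data = yaml.safe_load(prediction)
print(sorted(data))  # ['description', 'pr_files', 'title', 'type']
```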