From 31e91edebc2cae8d06f941c6153d2017e18d8731 Mon Sep 17 00:00:00 2001 From: zmeir Date: Thu, 17 Aug 2023 15:40:24 +0300 Subject: [PATCH 01/14] Allow keeping the original user description --- pr_agent/git_providers/bitbucket_provider.py | 10 +++------- pr_agent/git_providers/git_provider.py | 19 ++++++++++++++++++- pr_agent/git_providers/github_provider.py | 5 +---- pr_agent/git_providers/gitlab_provider.py | 5 +---- pr_agent/git_providers/local_git_provider.py | 2 +- pr_agent/settings/configuration.toml | 1 + pr_agent/tools/pr_description.py | 11 +++++++++-- 7 files changed, 34 insertions(+), 19 deletions(-) diff --git a/pr_agent/git_providers/bitbucket_provider.py b/pr_agent/git_providers/bitbucket_provider.py index 3596f4bf..e6cc74c9 100644 --- a/pr_agent/git_providers/bitbucket_provider.py +++ b/pr_agent/git_providers/bitbucket_provider.py @@ -6,12 +6,11 @@ from urllib.parse import urlparse import requests from atlassian.bitbucket import Cloud -from ..algo.pr_processing import clip_tokens from ..config_loader import get_settings -from .git_provider import FilePatchInfo +from .git_provider import FilePatchInfo, GitProvider -class BitbucketProvider: +class BitbucketProvider(GitProvider): def __init__(self, pr_url: Optional[str] = None, incremental: Optional[bool] = False): s = requests.Session() s.headers['Authorization'] = f'Bearer {get_settings().get("BITBUCKET.BEARER_TOKEN", None)}' @@ -156,10 +155,7 @@ class BitbucketProvider: def get_pr_branch(self): return self.pr.source_branch - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.pr.description, max_tokens) + def get_pr_description_full(self): return self.pr.description def get_user_id(self): diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 4d711a14..e7796d1e 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -82,9 
+82,26 @@ class GitProvider(ABC): pass @abstractmethod - def get_pr_description(self): + def get_pr_description_full(self): pass + def get_pr_description(self): + from pr_agent.config_loader import get_settings + from pr_agent.algo.pr_processing import clip_tokens + max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) + description = self.get_pr_description_full() + if max_tokens: + return clip_tokens(description, max_tokens) + return description + + def get_user_description(self): + description = (self.get_pr_description_full() or "").strip() + if not description.startswith("## PR Type"): + return description + if "## User Description:" not in description: + return "" + return description.split("## User Description:", 1)[1].strip() + @abstractmethod def get_issue_comments(self): pass diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index be0fa645..2e07e969 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -233,10 +233,7 @@ class GithubProvider(GitProvider): def get_pr_branch(self): return self.pr.head.ref - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.pr.body, max_tokens) + def get_pr_description_full(self): return self.pr.body def get_user_id(self): diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py index ec6f236d..1d4d1680 100644 --- a/pr_agent/git_providers/gitlab_provider.py +++ b/pr_agent/git_providers/gitlab_provider.py @@ -299,10 +299,7 @@ class GitLabProvider(GitProvider): def get_pr_branch(self): return self.mr.source_branch - def get_pr_description(self): - max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) - if max_tokens: - return clip_tokens(self.mr.description, max_tokens) + def get_pr_description_full(self): return self.mr.description def 
get_issue_comments(self): diff --git a/pr_agent/git_providers/local_git_provider.py b/pr_agent/git_providers/local_git_provider.py index a4f21969..59b97a85 100644 --- a/pr_agent/git_providers/local_git_provider.py +++ b/pr_agent/git_providers/local_git_provider.py @@ -158,7 +158,7 @@ class LocalGitProvider(GitProvider): def get_user_id(self): return -1 # Not used anywhere for the local provider, but required by the interface - def get_pr_description(self): + def get_pr_description_full(self): commits_diff = list(self.repo.iter_commits(self.target_branch_name + '..HEAD')) # Get the commit messages and concatenate commit_messages = " ".join([commit.message for commit in commits_diff]) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index ce920efd..ef684f2c 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -24,6 +24,7 @@ extra_instructions = "" [pr_description] # /describe # publish_description_as_comment=false +keep_user_description=true extra_instructions = "" [pr_questions] # /ask # diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index d55dd55a..13898e8f 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -42,6 +42,8 @@ class PRDescription: "extra_instructions": get_settings().pr_description.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages() } + + self.user_description = self.git_provider.get_user_description() # Initialize the token handler self.token_handler = TokenHandler( @@ -145,6 +147,9 @@ class PRDescription: # Load the AI prediction data into a dictionary data = load_yaml(self.prediction.strip()) + if get_settings().pr_description.keep_user_description and self.user_description: + data["User Description"] = self.user_description + # Initialization pr_types = [] @@ -167,7 +172,7 @@ class PRDescription: # Iterate over the remaining dictionary items and append the key and 
value to 'pr_body' in a markdown format, # except for the items containing the word 'walkthrough' pr_body = "" - for key, value in data.items(): + for idx, (key, value) in enumerate(data.items()): pr_body += f"## {key}:\n" if 'walkthrough' in key.lower(): # for filename, description in value.items(): @@ -179,7 +184,9 @@ class PRDescription: # if the value is a list, join its items by comma if type(value) == list: value = ', '.join(v for v in value) - pr_body += f"{value}\n\n___\n" + pr_body += f"{value}\n" + if idx < len(data) - 1: + pr_body += "\n___\n" if get_settings().config.verbosity_level >= 2: logging.info(f"title:\n{title}\n{pr_body}") From b3749d08e2655e40977e4f14b904c584978994af Mon Sep 17 00:00:00 2001 From: zmeir Date: Sun, 20 Aug 2023 19:00:56 +0300 Subject: [PATCH 02/14] Set default configuration to false to allow users to opt-in --- pr_agent/settings/configuration.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index ef684f2c..a881c500 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -24,7 +24,7 @@ extra_instructions = "" [pr_description] # /describe # publish_description_as_comment=false -keep_user_description=true +keep_user_description=false extra_instructions = "" [pr_questions] # /ask # From 81c38f9646c43a068e7618c25723ce3d4fa39ec0 Mon Sep 17 00:00:00 2001 From: zmeir Date: Mon, 21 Aug 2023 09:22:58 +0300 Subject: [PATCH 03/14] Added type hints --- pr_agent/git_providers/git_provider.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index e7796d1e..9db0a5bd 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -82,10 +82,10 @@ class GitProvider(ABC): pass @abstractmethod - def get_pr_description_full(self): + def get_pr_description_full(self) -> str: 
pass - def get_pr_description(self): + def get_pr_description(self) -> str: from pr_agent.config_loader import get_settings from pr_agent.algo.pr_processing import clip_tokens max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None) @@ -94,7 +94,7 @@ class GitProvider(ABC): return clip_tokens(description, max_tokens) return description - def get_user_description(self): + def get_user_description(self) -> str: description = (self.get_pr_description_full() or "").strip() if not description.startswith("## PR Type"): return description From fb9335f4247514ec15c3ee1a4ce3a8200d4bf28b Mon Sep 17 00:00:00 2001 From: mrT23 Date: Mon, 21 Aug 2023 09:07:21 +0300 Subject: [PATCH 04/14] extended improve --- pr_agent/agent/pr_agent.py | 2 + pr_agent/algo/ai_handler.py | 2 +- pr_agent/algo/git_patch_processing.py | 2 +- pr_agent/algo/pr_processing.py | 100 +++++++- pr_agent/config_loader.py | 1 + pr_agent/settings/configuration.toml | 8 + .../settings/pr_code_suggestions_prompts.toml | 67 +++--- pr_agent/settings/pr_reviewer_prompts.toml | 4 +- .../pr_sort_code_suggestions_prompts.toml | 46 ++++ .../tools/pr_extended_code_suggestions.py | 215 ++++++++++++++++++ 10 files changed, 406 insertions(+), 41 deletions(-) create mode 100644 pr_agent/settings/pr_sort_code_suggestions_prompts.toml create mode 100644 pr_agent/tools/pr_extended_code_suggestions.py diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py index 70121f3c..f722695c 100644 --- a/pr_agent/agent/pr_agent.py +++ b/pr_agent/agent/pr_agent.py @@ -11,6 +11,7 @@ from pr_agent.tools.pr_description import PRDescription from pr_agent.tools.pr_information_from_user import PRInformationFromUser from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_reviewer import PRReviewer +from pr_agent.tools.pr_extended_code_suggestions import PRExtendedCodeSuggestions from pr_agent.tools.pr_update_changelog import PRUpdateChangelog from pr_agent.tools.pr_config import PRConfig @@ -25,6 +26,7 
@@ command2class = { "describe_pr": PRDescription, "improve": PRCodeSuggestions, "improve_code": PRCodeSuggestions, + "extended_improve": PRExtendedCodeSuggestions, "ask": PRQuestions, "ask_question": PRQuestions, "update_changelog": PRUpdateChangelog, diff --git a/pr_agent/algo/ai_handler.py b/pr_agent/algo/ai_handler.py index fb5f64fe..0bc879d1 100644 --- a/pr_agent/algo/ai_handler.py +++ b/pr_agent/algo/ai_handler.py @@ -55,7 +55,7 @@ class AiHandler: @retry(exceptions=(APIError, Timeout, TryAgain, AttributeError, RateLimitError), tries=OPENAI_RETRIES, delay=2, backoff=2, jitter=(1, 3)) - async def chat_completion(self, model: str, temperature: float, system: str, user: str): + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2): """ Performs a chat completion using the OpenAI ChatCompletion API. Retries in case of API errors or timeouts. diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 1aec0006..230df41e 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -176,7 +176,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: ... 
""" - patch_with_lines_str = f"## {file.filename}\n" + patch_with_lines_str = f"\n\n## {file.filename}\n" import re patch_lines = patch.splitlines() RE_HUNK_HEADER = re.compile( diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index adab9506..1003f456 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -57,7 +57,7 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) # generate a standard diff string, with patch extension - patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler, + patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(pr_languages, token_handler, add_line_numbers_to_hunks) # if we are under the limit, return the full diff @@ -78,9 +78,9 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s return final_diff -def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, - add_line_numbers_to_hunks: bool) -> \ - Tuple[list, int]: +def pr_generate_extended_diff(pr_languages: list, + token_handler: TokenHandler, + add_line_numbers_to_hunks: bool) -> Tuple[list, int, list]: """ Generate a standard diff string with patch extension, while counting the number of tokens used and applying diff minimization techniques if needed. @@ -90,13 +90,10 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, files. - token_handler: An object of the TokenHandler class used for handling tokens in the context of the pull request. - add_line_numbers_to_hunks: A boolean indicating whether to add line numbers to the hunks in the diff. - - Returns: - - patches_extended: A list of extended patches for each file in the pull request. - - total_tokens: The total number of tokens used in the extended patches. 
""" total_tokens = token_handler.prompt_tokens # initial tokens patches_extended = [] + patches_extended_tokens = [] for lang in pr_languages: for file in lang['files']: original_file_content_str = file.base_file @@ -108,15 +105,16 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" - if add_line_numbers_to_hunks: + if add_line_numbers_to_hunks and PATCH_EXTRA_LINES > 0: full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file) patch_tokens = token_handler.count_tokens(full_extended_patch) file.tokens = patch_tokens total_tokens += patch_tokens + patches_extended_tokens.append(patch_tokens) patches_extended.append(full_extended_patch) - return patches_extended, total_tokens + return patches_extended, total_tokens, patches_extended_tokens def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str, @@ -337,4 +335,84 @@ def clip_tokens(text: str, max_tokens: int) -> str: return clipped_text except Exception as e: logging.warning(f"Failed to clip tokens: {e}") - return text \ No newline at end of file + return text + + +def get_pr_multi_diffs(git_provider: GitProvider, + token_handler: TokenHandler, + model: str, + max_calls: int = 5) -> List[str]: + """ + Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file. + The patches are split into multiple groups based on the maximum number of tokens allowed for the given model. + + Args: + git_provider (GitProvider): An object that provides access to Git provider APIs. + token_handler (TokenHandler): An object that handles tokens in the context of a pull request. + model (str): The name of the model. + max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5. 
+ + Returns: + List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model. + + Raises: + RateLimitExceededException: If the rate limit for the Git provider API is exceeded. + """ + try: + diff_files = git_provider.get_diff_files() + except RateLimitExceededException as e: + logging.error(f"Rate limit exceeded for git provider API. original message {e}") + raise + + # Sort files by main language + pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) + + # Sort files within each language group by tokens in descending order + sorted_files = [] + for lang in pr_languages: + sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) + + patches = [] + final_diff_list = [] + total_tokens = token_handler.prompt_tokens + call_number = 1 + for file in sorted_files: + if call_number > max_calls: + if get_settings().config.verbosity_level >= 2: + logging.info(f"Reached max calls ({max_calls})") + break + + original_file_content_str = file.base_file + new_file_content_str = file.head_file + patch = file.patch + if not patch: + continue + + # Remove delete-only hunks + patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) + if patch is None: + continue + + patch = convert_to_hunks_with_lines_numbers(patch, file) + new_patch_tokens = token_handler.count_tokens(patch) + if patch and (total_tokens + new_patch_tokens > MAX_TOKENS[model] - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD): + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + patches = [] + total_tokens = token_handler.prompt_tokens + call_number += 1 + if get_settings().config.verbosity_level >= 2: + logging.info(f"Call number: {call_number}") + + if patch: + patches.append(patch) + total_tokens += new_patch_tokens + if get_settings().config.verbosity_level >= 2: + logging.info(f"Tokens: {total_tokens}, last filename: 
{file.filename}") + + # Add the last chunk + if patches: + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + + return final_diff_list diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py index 3075e8dc..47edfd97 100644 --- a/pr_agent/config_loader.py +++ b/pr_agent/config_loader.py @@ -19,6 +19,7 @@ global_settings = Dynaconf( "settings/pr_questions_prompts.toml", "settings/pr_description_prompts.toml", "settings/pr_code_suggestions_prompts.toml", + "settings/pr_sort_code_suggestions_prompts.toml", "settings/pr_information_from_user_prompts.toml", "settings/pr_update_changelog_prompts.toml", "settings_prod/.secrets.toml" diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index ce920efd..00c9b453 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -32,6 +32,14 @@ extra_instructions = "" num_code_suggestions=4 extra_instructions = "" +[pr_extendeted_code_suggestions] # /extended_improve # +num_code_suggestions_per_chunk=8 +extra_instructions = "" +max_number_of_calls = 5 +rank_suggestions = true +final_clip_factor = 0.5 + + [pr_update_changelog] # /update_changelog # push_changelog_changes=false extra_instructions = "" diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index 76a3cb6b..be90c840 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -1,19 +1,47 @@ [pr_code_suggestions_prompt] -system="""You are a language model called CodiumAI-PR-Code-Reviewer. -Your task is to provide meaningfull non-trivial code suggestions to improve the new code in a PR (the '+' lines). -- Try to give important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. 
-- Suggestions should refer only to the 'new hunk' code, and focus on improving the new added code lines, with '+'. +system="""You are a language model called PR-Code-Reviewer. +Your task is to provide meaningful actionable code suggestions, to improve the new code presented in a PR. + +Example PR Diff input: +' +## src/file1.py + +--new hunk-- +12 code line that already existed in the file... +13 code line that already existed in the file.... +14 +new code line added in the PR +15 code line that already existed in the file... +16 code line that already existed in the file... + +--old hunk-- + code line that already existed in the file... +-code line that was removed in the PR + code line that already existed in the file... + + +--new hunk-- +... +--old hunk-- +... + + +## src/file2.py +... +' + +Specific instructions: +- Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. +- Suggestions should refer only to code from the '--new hunk--' sections, and focus on new lines of code (lines starting with '+'). - Provide the exact line number range (inclusive) for each issue. -- Assume there is additional code in the relevant file that is not included in the diff. +- Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). -- Don't output line numbers in the 'improved code' snippets. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '--new hunk--' code. 
{%- if extra_instructions %} Extra instructions from the user: {{ extra_instructions }} -{% endif %} +{%- endif %} You must use the following JSON schema to format your answer: ```json @@ -30,39 +58,26 @@ You must use the following JSON schema to format your answer: }, "suggestion content": { "type": "string", - "description": "a concrete suggestion for meaningfully improving the new PR code." + "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '--new hunk--' sections, starting with '+')." }, "existing code": { "type": "string", - "description": "a code snippet showing authentic relevant code lines from a 'new hunk' section. It must be continuous, correctly formatted and indented, and without line numbers." + "description": "a code snippet showing the relevant code lines from a '--new hunk--' section. It must be continuous, correctly formatted and indented, and without line numbers." }, "relevant lines": { "type": "string", - "description": "the relevant lines in the 'new hunk' sections, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." + "description": "the relevant lines from a '--new hunk--' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." }, "improved code": { "type": "string", - "description": "a new code snippet that can be used to replace the relevant lines in 'new hunk' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." + "description": "a new code snippet that can be used to replace the relevant lines in '--new hunk--' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." 
} } } } ``` -Example input: -' -## src/file1.py ----new_hunk--- -``` -[new hunk code, annotated with line numbers] -``` ----old_hunk--- -``` -[old hunk code] -``` -... -' - +Don't output line numbers in the 'improved code' snippets. Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. """ diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml index cdf7f731..b65d62e4 100644 --- a/pr_agent/settings/pr_reviewer_prompts.toml +++ b/pr_agent/settings/pr_reviewer_prompts.toml @@ -1,9 +1,9 @@ [pr_review_prompt] system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. -Your task is to provide constructive and concise feedback for the PR, and also provide meaningfull code suggestions to improve the new PR code (the '+' lines). +Your task is to provide constructive and concise feedback for the PR, and also provide meaningful code suggestions to improve the new PR code (the '+' lines). {%- if num_code_suggestions > 0 %} - Provide up to {{ num_code_suggestions }} code suggestions. -- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. +- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. - Suggestions should focus on improving the new added code lines. - Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). 
{%- endif %} diff --git a/pr_agent/settings/pr_sort_code_suggestions_prompts.toml b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml new file mode 100644 index 00000000..16b6e861 --- /dev/null +++ b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml @@ -0,0 +1,46 @@ +[pr_sort_code_suggestions_prompt] +system=""" +""" + +user="""You are given a list of code suggestions to improve a PR: + +{{ suggestion_str|trim }} + + +Your task is to sort the code suggestions by their order of importance, and return a list with sorting order. +The sorting order is a list of pairs, where each pair contains the index of the suggestion in the original list. +Rank the suggestions based on their importance to improving the PR, with critical issues first and minor issues last. + +You must use the following YAML schema to format your answer: +```yaml +Sort Order: + type: array + maxItems: {{ suggestion_list|length }} + uniqueItems: true + items: + suggestion number: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} + importance order: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} +``` + +Example output: +```yaml +Sort Order: + - suggestion number: 1 + importance order: 2 + - suggestion number: 2 + importance order: 3 + - suggestion number: 3 + importance order: 1 +``` + +Make sure to output a valid YAML. Use multi-line block scalar ('|') if needed. +Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. 
+Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/tools/pr_extended_code_suggestions.py b/pr_agent/tools/pr_extended_code_suggestions.py new file mode 100644 index 00000000..17f7b570 --- /dev/null +++ b/pr_agent/tools/pr_extended_code_suggestions.py @@ -0,0 +1,215 @@ +from typing import List +import copy +import json +import logging +import textwrap +from typing import Dict, Any + +import yaml +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handler import AiHandler +from pr_agent.algo.pr_processing import get_pr_multi_diffs, retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import try_fix_json +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.git_provider import get_main_pr_language + + +class PRExtendedCodeSuggestions: + def __init__(self, pr_url: str, cli_mode=False, args: list = None): + + self.git_provider = get_git_provider()(pr_url) + self.main_language = get_main_pr_language( + self.git_provider.get_languages(), self.git_provider.get_files() + ) + + self.ai_handler = AiHandler() + self.patches_diff = None + self.prediction = None + self.cli_mode = cli_mode + self.vars = { + "title": self.git_provider.pr.title, + "branch": self.git_provider.get_pr_branch(), + "description": self.git_provider.get_pr_description(), + "language": self.main_language, + "diff": "", # empty diff for initial calculation + "num_code_suggestions": get_settings().pr_extendeted_code_suggestions.num_code_suggestions_per_chunk, + "extra_instructions": get_settings().pr_extendeted_code_suggestions.extra_instructions, + "commit_messages_str": self.git_provider.get_commit_messages(), + } + self.token_handler = TokenHandler(self.git_provider.pr, + self.vars, + get_settings().pr_code_suggestions_prompt.system, + get_settings().pr_code_suggestions_prompt.user) + + async def run(self): + 
logging.info('Generating code suggestions for PR...') + if get_settings().config.publish_output: + self.git_provider.publish_comment("Preparing review...", is_temporary=True) + data = await retry_with_fallback_models(self._prepare_prediction) + + if get_settings().pr_extendeted_code_suggestions.rank_suggestions: + logging.info('Ranking Suggestions...') + data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) + + logging.info('Preparing PR review...') + if get_settings().config.publish_output: + logging.info('Pushing PR review...') + self.git_provider.remove_initial_comment() + logging.info('Pushing inline code comments...') + self.push_inline_code_suggestions(data) + + async def _prepare_prediction(self, model: str) -> dict: + logging.info('Getting PR diff...') + patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, + max_calls=get_settings().pr_extendeted_code_suggestions.max_number_of_calls) + + logging.info('Getting multi AI predictions...') + prediction_list = [] + for i, patches_diff in enumerate(patches_diff_list): + logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") + self.patches_diff = patches_diff + prediction = await self._get_prediction(model) + prediction_list.append(prediction) + self.prediction_list = prediction_list + + data = {} + for prediction in prediction_list: + self.prediction = prediction + data_per_chunk = self._prepare_pr_code_suggestions() + if "Code suggestions" in data: + data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) + else: + data.update(data_per_chunk) + self.data = data + return data + + async def _get_prediction(self, model: str): + variables = copy.deepcopy(self.vars) + variables["diff"] = self.patches_diff # update diff + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.system).render(variables) + user_prompt = 
environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, + system=system_prompt, user=user_prompt) + + return response + + def _prepare_pr_code_suggestions(self) -> str: + review = self.prediction.strip() + try: + data = json.loads(review) + except json.decoder.JSONDecodeError: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not parse json response: {review}") + data = try_fix_json(review, code_suggestions=True) + return data + + def push_inline_code_suggestions(self, data): + code_suggestions = [] + + if not data['Code suggestions']: + return self.git_provider.publish_comment('No suggestions found to improve this PR.') + + for d in data['Code suggestions']: + try: + if get_settings().config.verbosity_level >= 1: + logging.info(f"suggestion: {d}") + relevant_file = d['relevant file'].strip() + relevant_lines_str = d['relevant lines'].strip() + if ',' in relevant_lines_str: # handling 'relevant lines': '181, 190' or '178-184, 188-194' + relevant_lines_str = relevant_lines_str.split(',')[0] + relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position + relevant_lines_end = int(relevant_lines_str.split('-')[-1]) + content = d['suggestion content'] + new_code_snippet = d['improved code'] + + if new_code_snippet: + new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) + + body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```" + code_suggestions.append({'body': body, 'relevant_file': relevant_file, + 'relevant_lines_start': relevant_lines_start, + 'relevant_lines_end': relevant_lines_end}) + except Exception: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not 
parse suggestion: {d}") + + self.git_provider.publish_code_suggestions(code_suggestions) + + def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): + try: # dedent code snippet + self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \ + else self.git_provider.get_diff_files() + original_initial_line = None + for file in self.diff_files: + if file.filename.strip() == relevant_file: + original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] + break + if original_initial_line: + suggested_initial_line = new_code_snippet.splitlines()[0] + original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip()) + suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip()) + delta_spaces = original_initial_spaces - suggested_initial_spaces + if delta_spaces > 0: + new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") + + return new_code_snippet + + async def rank_suggestions(self, data: List) -> List: + """ + Call a model to rank (sort) code suggestions based on their importance order. + + Args: + data (List): A list of code suggestions to be ranked. + + Returns: + List: The ranked list of code suggestions. 
+ """ + + suggestion_list = [] + # remove invalid suggestions + for i, suggestion in enumerate(data): + if suggestion['existing code'] != suggestion['improved code']: + suggestion_list.append(suggestion) + + data_sorted = [[]] * len(suggestion_list) + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} + model = get_settings().config.model + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, user=user_prompt) + + sort_order = yaml.safe_load(response) + for s in sort_order['Sort Order']: + suggestion_number = s['suggestion number'] + importance_order = s['importance order'] + data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] + + if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + data_sorted = data_sorted[:new_len] + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not sort suggestions, error: {e}") + data_sorted = suggestion_list + + return data_sorted From b85679e5e44e212a1e3ea3a171492f85477cdb9a Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 09:42:59 +0300 Subject: [PATCH 05/14] improve --extend --- pr_agent/agent/pr_agent.py | 2 - pr_agent/algo/utils.py | 3 +- pr_agent/cli.py | 20 +- 
pr_agent/servers/help.py | 2 +- pr_agent/settings/configuration.toml | 10 +- pr_agent/tools/pr_code_suggestions.py | 107 ++++++++- .../tools/pr_extended_code_suggestions.py | 215 ------------------ 7 files changed, 122 insertions(+), 237 deletions(-) delete mode 100644 pr_agent/tools/pr_extended_code_suggestions.py diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py index f722695c..70121f3c 100644 --- a/pr_agent/agent/pr_agent.py +++ b/pr_agent/agent/pr_agent.py @@ -11,7 +11,6 @@ from pr_agent.tools.pr_description import PRDescription from pr_agent.tools.pr_information_from_user import PRInformationFromUser from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_reviewer import PRReviewer -from pr_agent.tools.pr_extended_code_suggestions import PRExtendedCodeSuggestions from pr_agent.tools.pr_update_changelog import PRUpdateChangelog from pr_agent.tools.pr_config import PRConfig @@ -26,7 +25,6 @@ command2class = { "describe_pr": PRDescription, "improve": PRCodeSuggestions, "improve_code": PRCodeSuggestions, - "extended_improve": PRExtendedCodeSuggestions, "ask": PRQuestions, "ask_question": PRQuestions, "update_changelog": PRUpdateChangelog, diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index 87e206cf..2139203f 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -247,7 +247,8 @@ def update_settings_from_args(args: List[str]) -> List[str]: arg = arg.strip('-').strip() vals = arg.split('=', 1) if len(vals) != 2: - logging.error(f'Invalid argument format: {arg}') + if len(vals) > 2: # --extended is a valid argument + logging.error(f'Invalid argument format: {arg}') other_args.append(arg) continue key, value = _fix_key_value(*vals) diff --git a/pr_agent/cli.py b/pr_agent/cli.py index 0f871041..01c1a7ec 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -19,13 +19,21 @@ For example: - cli.py --pr_url=... 
reflect Supported commands: -review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. -ask / ask_question [question] - Ask a question about the PR. -describe / describe_pr - Modify the PR title and description based on the PR's contents. -improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. -reflect - Ask the PR author questions about the PR. -update_changelog - Update the changelog based on the PR's contents. +-review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. +-ask / ask_question [question] - Ask a question about the PR. + +-describe / describe_pr - Modify the PR title and description based on the PR's contents. + +-improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. +Extended mode ('improve --extended') employs several calls, and provides a more thorough feedback + +-reflect - Ask the PR author questions about the PR. + +-update_changelog - Update the changelog based on the PR's contents. + + +Configuration: To edit any configuration parameter from 'configuration.toml', just add -config_path=. For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."' """) diff --git a/pr_agent/servers/help.py b/pr_agent/servers/help.py index 1ee7fc4d..0f3f3caa 100644 --- a/pr_agent/servers/help.py +++ b/pr_agent/servers/help.py @@ -1,7 +1,7 @@ commands_text = "> **/review [-i]**: Request a review of your Pull Request. For an incremental review, which only " \ "considers changes since the last review, include the '-i' option.\n" \ "> **/describe**: Modify the PR title and description based on the contents of the PR.\n" \ - "> **/improve**: Suggest improvements to the code in the PR. \n" \ + "> **/improve [--extended]**: Suggest improvements to the code in the PR. 
Extended mode employs several calls, and provides a more thorough feedback. \n" \ "> **/ask \\**: Pose a question about the PR.\n" \ "> **/update_changelog**: Update the changelog based on the PR's contents.\n\n" \ ">To edit any configuration parameter from **configuration.toml**, add --config_path=new_value\n" \ diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index 00c9b453..b1d19f97 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -31,14 +31,12 @@ extra_instructions = "" [pr_code_suggestions] # /improve # num_code_suggestions=4 extra_instructions = "" - -[pr_extendeted_code_suggestions] # /extended_improve # +rank_suggestions = false +# params for '/improve --extended' mode num_code_suggestions_per_chunk=8 -extra_instructions = "" +rank_extended_suggestions = true max_number_of_calls = 5 -rank_suggestions = true -final_clip_factor = 0.5 - +final_clip_factor = 0.9 [pr_update_changelog] # /update_changelog # push_changelog_changes=false diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index ebb88b21..4dc2f400 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -2,11 +2,13 @@ import copy import json import logging import textwrap +from typing import List +import yaml from jinja2 import Environment, StrictUndefined from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_multi_diffs from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.utils import try_fix_json from pr_agent.config_loader import get_settings @@ -22,6 +24,13 @@ class PRCodeSuggestions: self.git_provider.get_languages(), self.git_provider.get_files() ) + # extended mode + self.is_extended = any(["extended" in arg for arg in args]) + if self.is_extended: + 
num_code_suggestions = get_settings().pr_code_suggestions.num_code_suggestions_per_chunk + else: + num_code_suggestions = get_settings().pr_code_suggestions.num_code_suggestions + self.ai_handler = AiHandler() self.patches_diff = None self.prediction = None @@ -32,7 +41,7 @@ class PRCodeSuggestions: "description": self.git_provider.get_pr_description(), "language": self.main_language, "diff": "", # empty diff for initial calculation - "num_code_suggestions": get_settings().pr_code_suggestions.num_code_suggestions, + "num_code_suggestions": num_code_suggestions, "extra_instructions": get_settings().pr_code_suggestions.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages(), } @@ -42,14 +51,22 @@ class PRCodeSuggestions: get_settings().pr_code_suggestions_prompt.user) async def run(self): - # assert type(self.git_provider) != BitbucketProvider, "Bitbucket is not supported for now" - logging.info('Generating code suggestions for PR...') if get_settings().config.publish_output: self.git_provider.publish_comment("Preparing review...", is_temporary=True) - await retry_with_fallback_models(self._prepare_prediction) + logging.info('Preparing PR review...') - data = self._prepare_pr_code_suggestions() + if not self.is_extended: + await retry_with_fallback_models(self._prepare_prediction) + data = self._prepare_pr_code_suggestions() + else: + data = await retry_with_fallback_models(self._prepare_prediction_extended) + + if (not self.is_extended and get_settings().pr_code_suggestions.rank_suggestions) or \ + (self.is_extended and get_settings().pr_code_suggestions.rank_extended_suggestions): + logging.info('Ranking Suggestions...') + data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) + if get_settings().config.publish_output: logging.info('Pushing PR review...') self.git_provider.remove_initial_comment() @@ -145,3 +162,81 @@ class PRCodeSuggestions: return new_code_snippet + async def _prepare_prediction_extended(self, 
model: str) -> dict: + logging.info('Getting PR diff...') + patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, + max_calls=get_settings().pr_code_suggestions.max_number_of_calls) + + logging.info('Getting multi AI predictions...') + prediction_list = [] + for i, patches_diff in enumerate(patches_diff_list): + logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") + self.patches_diff = patches_diff + prediction = await self._get_prediction(model) + prediction_list.append(prediction) + self.prediction_list = prediction_list + + data = {} + for prediction in prediction_list: + self.prediction = prediction + data_per_chunk = self._prepare_pr_code_suggestions() + if "Code suggestions" in data: + data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) + else: + data.update(data_per_chunk) + self.data = data + return data + + async def rank_suggestions(self, data: List) -> List: + """ + Call a model to rank (sort) code suggestions based on their importance order. + + Args: + data (List): A list of code suggestions to be ranked. + + Returns: + List: The ranked list of code suggestions. 
+ """ + + suggestion_list = [] + # remove invalid suggestions + for i, suggestion in enumerate(data): + if suggestion['existing code'] != suggestion['improved code']: + suggestion_list.append(suggestion) + + data_sorted = [[]] * len(suggestion_list) + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} + model = get_settings().config.model + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render( + variables) + user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, + user=user_prompt) + + sort_order = yaml.safe_load(response) + for s in sort_order['Sort Order']: + suggestion_number = s['suggestion number'] + importance_order = s['importance order'] + data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] + + if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + data_sorted = data_sorted[:new_len] + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not sort suggestions, error: {e}") + data_sorted = suggestion_list + + return data_sorted + + diff --git a/pr_agent/tools/pr_extended_code_suggestions.py b/pr_agent/tools/pr_extended_code_suggestions.py deleted file mode 100644 index 17f7b570..00000000 --- a/pr_agent/tools/pr_extended_code_suggestions.py +++ /dev/null @@ -1,215 +0,0 @@ -from typing 
import List -import copy -import json -import logging -import textwrap -from typing import Dict, Any - -import yaml -from jinja2 import Environment, StrictUndefined - -from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_multi_diffs, retry_with_fallback_models -from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import try_fix_json -from pr_agent.config_loader import get_settings -from pr_agent.git_providers import get_git_provider -from pr_agent.git_providers.git_provider import get_main_pr_language - - -class PRExtendedCodeSuggestions: - def __init__(self, pr_url: str, cli_mode=False, args: list = None): - - self.git_provider = get_git_provider()(pr_url) - self.main_language = get_main_pr_language( - self.git_provider.get_languages(), self.git_provider.get_files() - ) - - self.ai_handler = AiHandler() - self.patches_diff = None - self.prediction = None - self.cli_mode = cli_mode - self.vars = { - "title": self.git_provider.pr.title, - "branch": self.git_provider.get_pr_branch(), - "description": self.git_provider.get_pr_description(), - "language": self.main_language, - "diff": "", # empty diff for initial calculation - "num_code_suggestions": get_settings().pr_extendeted_code_suggestions.num_code_suggestions_per_chunk, - "extra_instructions": get_settings().pr_extendeted_code_suggestions.extra_instructions, - "commit_messages_str": self.git_provider.get_commit_messages(), - } - self.token_handler = TokenHandler(self.git_provider.pr, - self.vars, - get_settings().pr_code_suggestions_prompt.system, - get_settings().pr_code_suggestions_prompt.user) - - async def run(self): - logging.info('Generating code suggestions for PR...') - if get_settings().config.publish_output: - self.git_provider.publish_comment("Preparing review...", is_temporary=True) - data = await retry_with_fallback_models(self._prepare_prediction) - - if get_settings().pr_extendeted_code_suggestions.rank_suggestions: - 
logging.info('Ranking Suggestions...') - data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) - - logging.info('Preparing PR review...') - if get_settings().config.publish_output: - logging.info('Pushing PR review...') - self.git_provider.remove_initial_comment() - logging.info('Pushing inline code comments...') - self.push_inline_code_suggestions(data) - - async def _prepare_prediction(self, model: str) -> dict: - logging.info('Getting PR diff...') - patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, - max_calls=get_settings().pr_extendeted_code_suggestions.max_number_of_calls) - - logging.info('Getting multi AI predictions...') - prediction_list = [] - for i, patches_diff in enumerate(patches_diff_list): - logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") - self.patches_diff = patches_diff - prediction = await self._get_prediction(model) - prediction_list.append(prediction) - self.prediction_list = prediction_list - - data = {} - for prediction in prediction_list: - self.prediction = prediction - data_per_chunk = self._prepare_pr_code_suggestions() - if "Code suggestions" in data: - data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) - else: - data.update(data_per_chunk) - self.data = data - return data - - async def _get_prediction(self, model: str): - variables = copy.deepcopy(self.vars) - variables["diff"] = self.patches_diff # update diff - environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.system).render(variables) - user_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - 
system=system_prompt, user=user_prompt) - - return response - - def _prepare_pr_code_suggestions(self) -> str: - review = self.prediction.strip() - try: - data = json.loads(review) - except json.decoder.JSONDecodeError: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not parse json response: {review}") - data = try_fix_json(review, code_suggestions=True) - return data - - def push_inline_code_suggestions(self, data): - code_suggestions = [] - - if not data['Code suggestions']: - return self.git_provider.publish_comment('No suggestions found to improve this PR.') - - for d in data['Code suggestions']: - try: - if get_settings().config.verbosity_level >= 1: - logging.info(f"suggestion: {d}") - relevant_file = d['relevant file'].strip() - relevant_lines_str = d['relevant lines'].strip() - if ',' in relevant_lines_str: # handling 'relevant lines': '181, 190' or '178-184, 188-194' - relevant_lines_str = relevant_lines_str.split(',')[0] - relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position - relevant_lines_end = int(relevant_lines_str.split('-')[-1]) - content = d['suggestion content'] - new_code_snippet = d['improved code'] - - if new_code_snippet: - new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) - - body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```" - code_suggestions.append({'body': body, 'relevant_file': relevant_file, - 'relevant_lines_start': relevant_lines_start, - 'relevant_lines_end': relevant_lines_end}) - except Exception: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not parse suggestion: {d}") - - self.git_provider.publish_code_suggestions(code_suggestions) - - def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): - try: # dedent code snippet - self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \ - else self.git_provider.get_diff_files() - 
original_initial_line = None - for file in self.diff_files: - if file.filename.strip() == relevant_file: - original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] - break - if original_initial_line: - suggested_initial_line = new_code_snippet.splitlines()[0] - original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip()) - suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip()) - delta_spaces = original_initial_spaces - suggested_initial_spaces - if delta_spaces > 0: - new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') - except Exception as e: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") - - return new_code_snippet - - async def rank_suggestions(self, data: List) -> List: - """ - Call a model to rank (sort) code suggestions based on their importance order. - - Args: - data (List): A list of code suggestions to be ranked. - - Returns: - List: The ranked list of code suggestions. 
- """ - - suggestion_list = [] - # remove invalid suggestions - for i, suggestion in enumerate(data): - if suggestion['existing code'] != suggestion['improved code']: - suggestion_list.append(suggestion) - - data_sorted = [[]] * len(suggestion_list) - - try: - suggestion_str = "" - for i, suggestion in enumerate(suggestion_list): - suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' - - variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} - model = get_settings().config.model - environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render(variables) - user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, user=user_prompt) - - sort_order = yaml.safe_load(response) - for s in sort_order['Sort Order']: - suggestion_number = s['suggestion number'] - importance_order = s['importance order'] - data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] - - if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: - new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) - data_sorted = data_sorted[:new_len] - except Exception as e: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not sort suggestions, error: {e}") - data_sorted = suggestion_list - - return data_sorted From 2b22f712fb9105c9a2b220c6d61db5e55615e41a Mon Sep 17 00:00:00 2001 From: zmeir Date: Tue, 22 Aug 2023 09:55:56 +0300 Subject: [PATCH 06/14] Renamed keep_user_description --> add_original_user_description --- pr_agent/settings/configuration.toml | 2 +- 
pr_agent/tools/pr_description.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index a881c500..d3098f4f 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -24,7 +24,7 @@ extra_instructions = "" [pr_description] # /describe # publish_description_as_comment=false -keep_user_description=false +add_original_user_description=false extra_instructions = "" [pr_questions] # /ask # diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index 13898e8f..98fbe0ce 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -147,7 +147,7 @@ class PRDescription: # Load the AI prediction data into a dictionary data = load_yaml(self.prediction.strip()) - if get_settings().pr_description.keep_user_description and self.user_description: + if get_settings().pr_description.add_original_user_description and self.user_description: data["User Description"] = self.user_description # Initialization From 09ef809080cd72051734d3fe94313efc71b272c1 Mon Sep 17 00:00:00 2001 From: zmeir Date: Tue, 22 Aug 2023 10:04:21 +0300 Subject: [PATCH 07/14] Added comments explaining the logic behind `get_user_description` --- pr_agent/git_providers/git_provider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 9db0a5bd..0837155c 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -96,10 +96,14 @@ class GitProvider(ABC): def get_user_description(self) -> str: description = (self.get_pr_description_full() or "").strip() + # if the existing description wasn't generated by the pr-agent, just return it as-is if not description.startswith("## PR Type"): return description + # if the existing description was generated by the pr-agent, but it doesn't contain the user description, + # 
return nothing (empty string) because it means there is no user description if "## User Description:" not in description: return "" + # otherwise, extract the original user description from the existing pr-agent description and return it return description.split("## User Description:", 1)[1].strip() @abstractmethod From 580af44e7dd26e0565e318c9130146f2eb551aa0 Mon Sep 17 00:00:00 2001 From: idavidov Date: Tue, 22 Aug 2023 10:24:20 +0300 Subject: [PATCH 08/14] Could we consider adding permissions to the GitHub Actions section? I've noticed that this has been a point of confusion for some users, as evidenced by questions in our Discord channel and GitHub issues. Some folks may even be discouraged to the point of not seeking help. I believe adding permissions could significantly improve the user experience. What are your thoughts? --- INSTALL.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 73848ade..4fed88a3 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -54,6 +54,10 @@ on: jobs: pr_agent_job: runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + contents: write name: Run pr agent on every pull request, respond to user comments steps: - name: PR Agent action step @@ -72,6 +76,10 @@ on: jobs: pr_agent_job: runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + contents: write name: Run pr agent on every pull request, respond to user comments steps: - name: PR Agent action step From 82fb611a265527de1761ee4b48da1e860cafbe36 Mon Sep 17 00:00:00 2001 From: zmeir Date: Tue, 22 Aug 2023 10:32:58 +0300 Subject: [PATCH 09/14] Add options to keep original user title --- pr_agent/settings/configuration.toml | 1 + pr_agent/tools/pr_description.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index d3098f4f..73b32878 100644 --- a/pr_agent/settings/configuration.toml +++ 
b/pr_agent/settings/configuration.toml @@ -25,6 +25,7 @@ extra_instructions = "" [pr_description] # /describe # publish_description_as_comment=false add_original_user_description=false +keep_original_user_title=false extra_instructions = "" [pr_questions] # /ask # diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index 98fbe0ce..440675fd 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -166,8 +166,14 @@ class PRDescription: elif type(data['PR Type']) == str: pr_types = data['PR Type'].split(',') - # Assign the value of the 'PR Title' key to 'title' variable and remove it from the dictionary - title = data.pop('PR Title') + # Remove the 'PR Title' key from the dictionary + ai_title = data.pop('PR Title') + if get_settings().pr_description.keep_original_user_title: + # Assign the original PR title to the 'title' variable + title = self.vars["title"] + else: + # Assign the value of the 'PR Title' key to 'title' variable + title = ai_title # Iterate over the remaining dictionary items and append the key and value to 'pr_body' in a markdown format, # except for the items containing the word 'walkthrough' From f4f040bf8d8443224f083d4cb544c206de9767fc Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:11:51 +0300 Subject: [PATCH 10/14] publish each suggestion separably --- pr_agent/algo/git_patch_processing.py | 24 ++++++++++++------- pr_agent/algo/pr_processing.py | 10 ++++---- .../settings/pr_code_suggestions_prompts.toml | 24 ++++++++++--------- pr_agent/tools/pr_code_suggestions.py | 12 ++++++---- 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 230df41e..1a2bd22b 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -1,5 +1,4 @@ from __future__ import annotations - import logging import re @@ -157,7 +156,7 @@ def 
convert_to_hunks_with_lines_numbers(patch: str, file) -> str: example output: ## src/file.ts ---new hunk-- +__new hunk__ 881 line1 882 line2 883 line3 @@ -166,7 +165,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: 889 line6 890 line7 ... ---old hunk-- +__old hunk__ line1 line2 - line3 @@ -177,7 +176,6 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: """ patch_with_lines_str = f"\n\n## {file.filename}\n" - import re patch_lines = patch.splitlines() RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") @@ -185,23 +183,30 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: old_content_lines = [] match = None start1, size1, start2, size2 = -1, -1, -1, -1 + prev_header_line = [] + header_line =[] for line in patch_lines: if 'no newline at end of file' in line.lower(): continue if line.startswith('@@'): + header_line = line match = RE_HUNK_HEADER.match(line) if match and new_content_lines: # found a new hunk, split the previous lines if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' + if prev_header_line: + patch_with_lines_str += f'\n{prev_header_line}\n' + patch_with_lines_str += '__new hunk__\n' for i, line_new in enumerate(new_content_lines): patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '--old hunk--\n' + patch_with_lines_str += '__old hunk__\n' for line_old in old_content_lines: patch_with_lines_str += f"{line_old}\n" new_content_lines = [] old_content_lines = [] + if match: + prev_header_line = header_line try: start1, size1, start2, size2 = map(int, match.groups()[:4]) except: # '@@ -0,0 +1 @@' case @@ -219,12 +224,13 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: # finishing last hunk if match and new_content_lines: if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' + patch_with_lines_str += f'\n{header_line}\n' + patch_with_lines_str += '\n__new hunk__\n' for i, 
line_new in enumerate(new_content_lines): patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '\n--old hunk--\n' + patch_with_lines_str += '\n__old hunk__\n' for line_old in old_content_lines: patch_with_lines_str += f"{line_old}\n" - return patch_with_lines_str.strip() + return patch_with_lines_str.rstrip() diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 1003f456..29709d29 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -24,7 +24,7 @@ OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600 PATCH_EXTRA_LINES = 3 def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str, - add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str: + add_line_numbers_to_hunks: bool = True, disable_extra_lines: bool = True) -> str: """ Returns a string with the diff of the pull request, applying diff minimization techniques if needed. @@ -103,9 +103,9 @@ def pr_generate_extended_diff(pr_languages: list, # extend each patch with extra lines of context extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) - full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" + full_extended_patch = f"\n\n## {file.filename}\n\n{extended_patch}\n" - if add_line_numbers_to_hunks and PATCH_EXTRA_LINES > 0: + if add_line_numbers_to_hunks: full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file) patch_tokens = token_handler.count_tokens(full_extended_patch) @@ -322,7 +322,9 @@ def clip_tokens(text: str, max_tokens: int) -> str: Returns: str: The clipped string. 
""" - # We'll estimate the number of tokens by hueristically assuming 2.5 tokens per word + if not text: + return text + try: encoder = get_token_encoder() num_input_tokens = len(encoder.encode(text)) diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index be90c840..e32ffd72 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -6,22 +6,23 @@ Example PR Diff input: ' ## src/file1.py ---new hunk-- +@@ -12,3 +12,5 @@ def func1(): +__new hunk__ 12 code line that already existed in the file... 13 code line that already existed in the file.... 14 +new code line added in the PR 15 code line that already existed in the file... 16 code line that already existed in the file... - ---old hunk-- +__old hunk__ code line that already existed in the file... -code line that was removed in the PR code line that already existed in the file... ---new hunk-- +@@ ... @@ def func2(): +__new hunk__ ... ---old hunk-- +__old hunk__ ... @@ -31,11 +32,12 @@ Example PR Diff input: Specific instructions: - Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. -- Suggestions should refer only to code from the '--new hunk--' sections, and focus on new lines of code (lines starting with '+'). +- Suggestions should refer only to code from the '__new hunk__' sections, and focus on new lines of code (lines starting with '+'). - Provide the exact line number range (inclusive) for each issue. - Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Avoid making suggestions that have already been implemented in the PR code. 
For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '--new hunk--' code. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '__new hunk__' code. +- Don't suggest to add docstring or type hints. {%- if extra_instructions %} @@ -58,19 +60,19 @@ You must use the following JSON schema to format your answer: }, "suggestion content": { "type": "string", - "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '--new hunk--' sections, starting with '+')." + "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '__new hunk__' sections, starting with '+')." }, "existing code": { "type": "string", - "description": "a code snippet showing the relevant code lines from a '--new hunk--' section. It must be continuous, correctly formatted and indented, and without line numbers." + "description": "a code snippet showing the relevant code lines from a '__new hunk__' section. It must be continuous, correctly formatted and indented, and without line numbers." }, "relevant lines": { "type": "string", - "description": "the relevant lines from a '--new hunk--' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." + "description": "the relevant lines from a '__new hunk__' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." }, "improved code": { "type": "string", - "description": "a new code snippet that can be used to replace the relevant lines in '--new hunk--' code. 
Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." + "description": "a new code snippet that can be used to replace the relevant lines in '__new hunk__' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." } } } diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index 4dc2f400..cc787f5e 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -70,7 +70,7 @@ class PRCodeSuggestions: if get_settings().config.publish_output: logging.info('Pushing PR review...') self.git_provider.remove_initial_comment() - logging.info('Pushing inline code comments...') + logging.info('Pushing inline code suggestions...') self.push_inline_code_suggestions(data) async def _prepare_prediction(self, model: str): @@ -138,7 +138,11 @@ class PRCodeSuggestions: if get_settings().config.verbosity_level >= 2: logging.info(f"Could not parse suggestion: {d}") - self.git_provider.publish_code_suggestions(code_suggestions) + is_successful = self.git_provider.publish_code_suggestions(code_suggestions) + if not is_successful: + logging.info("Failed to publish code suggestions, trying to publish each suggestion separately") + for code_suggestion in code_suggestions: + self.git_provider.publish_code_suggestions([code_suggestion]) def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): try: # dedent code snippet @@ -229,8 +233,8 @@ class PRCodeSuggestions: importance_order = s['importance order'] data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] - if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: - new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + if get_settings().pr_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * 
get_settings().pr_code_suggestions.final_clip_factor) data_sorted = data_sorted[:new_len] except Exception as e: if get_settings().config.verbosity_level >= 1: From 36e5e5a17e559a340df961517f506d62fb41bd97 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:30:18 +0300 Subject: [PATCH 11/14] update --- pr_agent/settings/pr_code_suggestions_prompts.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index e32ffd72..a65b546f 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -36,7 +36,7 @@ Specific instructions: - Provide the exact line number range (inclusive) for each issue. - Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '__new hunk__' code. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code. - Don't suggest to add docstring or type hints. 
{%- if extra_instructions %} From 9157fa670e707ea6733d10821818a07132915c2c Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:32:22 +0300 Subject: [PATCH 12/14] -> bool --- pr_agent/git_providers/bitbucket_provider.py | 2 +- pr_agent/git_providers/git_provider.py | 2 +- pr_agent/git_providers/github_provider.py | 2 +- pr_agent/git_providers/gitlab_provider.py | 2 +- pr_agent/git_providers/local_git_provider.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pr_agent/git_providers/bitbucket_provider.py b/pr_agent/git_providers/bitbucket_provider.py index 3596f4bf..85ead79d 100644 --- a/pr_agent/git_providers/bitbucket_provider.py +++ b/pr_agent/git_providers/bitbucket_provider.py @@ -36,7 +36,7 @@ class BitbucketProvider: except Exception: return "" - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on the PR. """ diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 4d711a14..c3be8bc3 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -54,7 +54,7 @@ class GitProvider(ABC): pass @abstractmethod - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: pass @abstractmethod diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index be0fa645..f400b92d 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -166,7 +166,7 @@ class GithubProvider(GitProvider): def publish_inline_comments(self, comments: list[dict]): self.pr.create_review(commit=self.last_commit_id, comments=comments) - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on 
the PR. """ diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py index ec6f236d..f1fa0119 100644 --- a/pr_agent/git_providers/gitlab_provider.py +++ b/pr_agent/git_providers/gitlab_provider.py @@ -195,7 +195,7 @@ class GitLabProvider(GitProvider): f'No relevant diff found for {relevant_file} {relevant_line_in_file}. Falling back to last diff.') return self.last_diff # fallback to last_diff if no relevant diff is found - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: for suggestion in code_suggestions: try: body = suggestion['body'] diff --git a/pr_agent/git_providers/local_git_provider.py b/pr_agent/git_providers/local_git_provider.py index a4f21969..cf8f38b2 100644 --- a/pr_agent/git_providers/local_git_provider.py +++ b/pr_agent/git_providers/local_git_provider.py @@ -130,7 +130,7 @@ class LocalGitProvider(GitProvider): relevant_lines_start: int, relevant_lines_end: int): raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') def publish_labels(self, labels): From d31b66b65603f04d7e9858fa73292d2cec000f19 Mon Sep 17 00:00:00 2001 From: Phill Zarfos Date: Tue, 22 Aug 2023 17:15:11 -0400 Subject: [PATCH 13/14] initial implementation of CodeCommit --- INSTALL.md | 68 ++++ README.md | 42 +- pr_agent/git_providers/__init__.py | 2 + pr_agent/git_providers/codecommit_client.py | 203 ++++++++++ pr_agent/git_providers/codecommit_provider.py | 363 ++++++++++++++++++ pyproject.toml | 3 +- requirements.txt | 1 + tests/unittest/test_codecommit_client.py | 136 +++++++ tests/unittest/test_codecommit_provider.py | 119 ++++++ 9 files changed, 915 insertions(+), 22 deletions(-) 
create mode 100644 pr_agent/git_providers/codecommit_client.py create mode 100644 pr_agent/git_providers/codecommit_provider.py create mode 100644 tests/unittest/test_codecommit_client.py create mode 100644 tests/unittest/test_codecommit_provider.py diff --git a/INSTALL.md b/INSTALL.md index 4fed88a3..303b3217 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -255,3 +255,71 @@ docker push codiumai/pr-agent:github_app # Push to your Docker repository 5. Configure the lambda function to have a Function URL. 6. Go back to steps 8-9 of [Method 5](#method-5-run-as-a-github-app) with the function url as your Webhook URL. The Webhook URL would look like `https:///api/v1/github_webhooks` + +--- + +#### AWS CodeCommit Setup + +Not all features have been added to CodeCommit yet. As of right now, CodeCommit has been implemented to run the pr-agent CLI on the command line, using AWS credentials stored in environment variables. (More features will be added in the future.) The following is a set of instructions to have pr-agent do a review of your CodeCommit pull request from the command line: + +1. Create an IAM user that you will use to read CodeCommit pull requests and post comments + * Note: That user should have CLI access only, not Console access +2. Add IAM permissions to that user, to allow access to CodeCommit (see IAM Role example below) +3. Generate an Access Key for your IAM user +4. Set the Access Key and Secret using environment variables (see Access Key example below) +5. Set the `git_provider` value to `codecommit` in the `pr_agent/settings/configuration.toml` settings file +6. 
Set the `PYTHONPATH` to include your `pr-agent` project directory + * Option A: Add `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent"` to your `.env` file + * Option B: Set `PYTHONPATH` and run the CLI in one command, for example: + * `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent" python pr_agent/cli.py [--ARGS]` + +#### AWS CodeCommit IAM Role Example + +Example IAM permissions for that user, to allow access to CodeCommit: + +* Note: The following is a working example of IAM permissions that has read access to the repositories and write access to allow posting comments +* Note: If you only want pr-agent to review your pull requests, you can tighten the IAM permissions further, however this IAM example will work, and allow the pr-agent to post comments to the PR +* Note: You may want to replace the `"Resource": "*"` with your list of repos, to limit access to only those repos + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "codecommit:BatchDescribe*", + "codecommit:BatchGet*", + "codecommit:Describe*", + "codecommit:EvaluatePullRequestApprovalRules", + "codecommit:Get*", + "codecommit:List*", + "codecommit:PostComment*", + "codecommit:PutCommentReaction" + ], + "Resource": "*" + } + ] +} +``` + +#### AWS CodeCommit Access Key and Secret + +Example of setting the Access Key and Secret using environment variables: + +```sh +export AWS_ACCESS_KEY_ID="XXXXXXXXXXXXXXXX" +export AWS_SECRET_ACCESS_KEY="XXXXXXXXXXXXXXXX" +export AWS_DEFAULT_REGION="us-east-1" +``` + +#### AWS CodeCommit CLI Example + +After you set up AWS CodeCommit using the instructions above, here is an example CLI run that tells pr-agent to **review** a given pull request. 
+(Replace your specific PYTHONPATH and PR URL in the example) + +```sh +PYTHONPATH="/PATH/TO/PROJECTS/pr-agent" python pr_agent/cli.py \ + --pr_url https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/MY_REPO_NAME/pull-requests/321 \ + review +``` diff --git a/README.md b/README.md index d2c4e171..1b120241 100644 --- a/README.md +++ b/README.md @@ -75,26 +75,26 @@ CodiumAI `PR-Agent` is an open-source tool aiming to help developers review pull ## Overview `PR-Agent` offers extensive pull request functionalities across various git providers: -| | | GitHub | Gitlab | Bitbucket | -|-------|---------------------------------------------|:------:|:------:|:---------:| -| TOOLS | Review | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | ⮑ Inline review | :white_check_mark: | :white_check_mark: | | -| | Ask | :white_check_mark: | :white_check_mark: | :white_check_mark: -| | Auto-Description | :white_check_mark: | :white_check_mark: | | -| | Improve Code | :white_check_mark: | :white_check_mark: | | -| | Reflect and Review | :white_check_mark: | | | -| | Update CHANGELOG.md | :white_check_mark: | | | -| | | | | | -| USAGE | CLI | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | App / webhook | :white_check_mark: | :white_check_mark: | | -| | Tagging bot | :white_check_mark: | | | -| | Actions | :white_check_mark: | | | -| | | | | | -| CORE | PR compression | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Repo language prioritization | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Adaptive and token-aware
file patch fitting | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Multiple models support | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| | Incremental PR Review | :white_check_mark: | | | +| | | GitHub | Gitlab | Bitbucket | CodeCommit | +|-------|---------------------------------------------|:------:|:------:|:---------:|:----------:| +| TOOLS | Review | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| | ⮑ Inline review | :white_check_mark: | :white_check_mark: | | | +| | Ask | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| | Auto-Description | :white_check_mark: | :white_check_mark: | | | +| | Improve Code | :white_check_mark: | :white_check_mark: | | | +| | Reflect and Review | :white_check_mark: | | | | +| | Update CHANGELOG.md | :white_check_mark: | | | | +| | | | | | | +| USAGE | CLI | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| | App / webhook | :white_check_mark: | :white_check_mark: | | | +| | Tagging bot | :white_check_mark: | | | | +| | Actions | :white_check_mark: | | | | +| | | | | | | +| CORE | PR compression | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| | Repo language prioritization | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| | Adaptive and token-aware
file patch fitting | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| | Multiple models support | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| | Incremental PR Review | :white_check_mark: | | | | Examples for invoking the different tools via the CLI: - **Review**: python cli.py --pr_url= review @@ -153,7 +153,7 @@ Here are some advantages of PR-Agent: - We emphasize **real-life practical usage**. Each tool (review, improve, ask, ...) has a single GPT-4 call, no more. We feel that this is critical for realistic team usage - obtaining an answer quickly (~30 seconds) and affordably. - Our [PR Compression strategy](./PR_COMPRESSION.md) is a core ability that enables to effectively tackle both short and long PRs. - Our JSON prompting strategy enables to have **modular, customizable tools**. For example, the '/review' tool categories can be controlled via the [configuration](./CONFIGURATION.md) file. Adding additional categories is easy and accessible. -- We support **multiple git providers** (GitHub, Gitlab, Bitbucket), **multiple ways** to use the tool (CLI, GitHub Action, GitHub App, Docker, ...), and **multiple models** (GPT-4, GPT-3.5, Anthropic, Cohere, Llama2). +- We support **multiple git providers** (GitHub, Gitlab, Bitbucket, CodeCommit), **multiple ways** to use the tool (CLI, GitHub Action, GitHub App, Docker, ...), and **multiple models** (GPT-4, GPT-3.5, Anthropic, Cohere, Llama2). - We are open-source, and welcome contributions from the community. 
diff --git a/pr_agent/git_providers/__init__.py b/pr_agent/git_providers/__init__.py index e7c2aa0f..dddf58c8 100644 --- a/pr_agent/git_providers/__init__.py +++ b/pr_agent/git_providers/__init__.py @@ -1,5 +1,6 @@ from pr_agent.config_loader import get_settings from pr_agent.git_providers.bitbucket_provider import BitbucketProvider +from pr_agent.git_providers.codecommit_provider import CodeCommitProvider from pr_agent.git_providers.github_provider import GithubProvider from pr_agent.git_providers.gitlab_provider import GitLabProvider from pr_agent.git_providers.local_git_provider import LocalGitProvider @@ -8,6 +9,7 @@ _GIT_PROVIDERS = { 'github': GithubProvider, 'gitlab': GitLabProvider, 'bitbucket': BitbucketProvider, + 'codecommit': CodeCommitProvider, 'local' : LocalGitProvider } diff --git a/pr_agent/git_providers/codecommit_client.py b/pr_agent/git_providers/codecommit_client.py new file mode 100644 index 00000000..c1cfa763 --- /dev/null +++ b/pr_agent/git_providers/codecommit_client.py @@ -0,0 +1,203 @@ +import boto3 +import botocore + + +class CodeCommitDifferencesResponse: + """ + CodeCommitDifferencesResponse is the response object returned from our get_differences() function. + It maps the JSON response to member variables of this class. + """ + + def __init__(self, json: dict): + before_blob = json.get("beforeBlob", {}) + after_blob = json.get("afterBlob", {}) + + self.before_blob_id = before_blob.get("blobId", "") + self.before_blob_path = before_blob.get("path", "") + self.after_blob_id = after_blob.get("blobId", "") + self.after_blob_path = after_blob.get("path", "") + self.change_type = json.get("changeType", "") + + +class CodeCommitPullRequestResponse: + """ + CodeCommitPullRequestResponse is the response object returned from our get_pr() function. + It maps the JSON response to member variables of this class. 
+ """ + + def __init__(self, json: dict): + self.title = json.get("title", "") + self.description = json.get("description", "") + + self.targets = [] + for target in json.get("pullRequestTargets", []): + self.targets.append(CodeCommitPullRequestResponse.CodeCommitPullRequestTarget(target)) + + class CodeCommitPullRequestTarget: + """ + CodeCommitPullRequestTarget is a subclass of CodeCommitPullRequestResponse that + holds details about an individual target commit. + """ + + def __init__(self, json: dict): + self.source_commit = json.get("sourceCommit", "") + self.source_branch = json.get("sourceReference", "") + self.destination_commit = json.get("destinationCommit", "") + self.destination_branch = json.get("destinationReference", "") + + +class CodeCommitClient: + """ + CodeCommitClient is a wrapper around the AWS boto3 SDK for the CodeCommit client + """ + + def __init__(self): + self.boto_client = None + + def _connect_boto_client(self): + try: + self.boto_client = boto3.client("codecommit") + except Exception as e: + raise ValueError(f"Failed to connect to AWS CodeCommit: {e}") + + def get_differences(self, repo_name: int, destination_commit: str, source_commit: str): + """ + Get the differences between two commits in CodeCommit. + + Parameters: + - repo_name: Name of the repository + - destination_commit: Commit hash you want to merge into (the "before" hash) (usually on the main or master branch) + - source_commit: Commit hash of the code you are adding (the "after" branch) + + Returns: + - List of CodeCommitDifferencesResponse objects + + Boto3 Documentation: + aws codecommit get-differences + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_differences.html + """ + if self.boto_client is None: + self._connect_boto_client() + + # The differences response from AWS is paginated, so we need to iterate through the pages to get all the differences. 
+ differences = [] + try: + paginator = self.boto_client.get_paginator("get_differences") + for page in paginator.paginate( + repositoryName=repo_name, + beforeCommitSpecifier=destination_commit, + afterCommitSpecifier=source_commit, + ): + differences.extend(page.get("differences", [])) + except botocore.exceptions.ClientError as e: + raise ValueError(f"Failed to retrieve differences from CodeCommit PR #{self.pr_num}") from e + + output = [] + for json in differences: + output.append(CodeCommitDifferencesResponse(json)) + return output + + def get_file(self, repo_name: str, file_path: str, sha_hash: str, optional: bool = False): + """ + Retrieve a file from CodeCommit. + + Parameters: + - repo_name: Name of the repository + - file_path: Path to the file you are retrieving + - sha_hash: Commit hash of the file you are retrieving + + Returns: + - File contents + + Boto3 Documentation: + aws codecommit get_file + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_file.html + """ + if not file_path: + return "" + + if self.boto_client is None: + self._connect_boto_client() + + try: + response = self.boto_client.get_file(repositoryName=repo_name, commitSpecifier=sha_hash, filePath=file_path) + except botocore.exceptions.ClientError as e: + # if the file does not exist, but is flagged as optional, then return an empty string + if optional and e.response["Error"]["Code"] == 'FileDoesNotExistException': + return "" + raise ValueError(f"CodeCommit cannot retrieve file '{file_path}' from repository '{repo_name}'") from e + except Exception as e: + raise ValueError(f"CodeCommit cannot retrieve file '{file_path}' from repository '{repo_name}'") from e + if "fileContent" not in response: + raise ValueError(f"File content is empty for file: {file_path}") + + return response.get("fileContent", "") + + def get_pr(self, pr_number: int): + """ + Get a information about a CodeCommit PR. 
+ + Parameters: + - pr_number: The PR number you are requesting + + Returns: + - CodeCommitPullRequestResponse object + + Boto3 Documentation: + aws codecommit get_pull_request + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_pull_request.html + """ + if self.boto_client is None: + self._connect_boto_client() + + try: + response = self.boto_client.get_pull_request(pullRequestId=str(pr_number)) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'PullRequestDoesNotExistException': + raise ValueError(f"CodeCommit cannot retrieve PR: PR number does not exist: {pr_number}") from e + raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}: boto client error") from e + except Exception as e: + raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}") from e + + if "pullRequest" not in response: + raise ValueError("CodeCommit PR number not found: {pr_number}") + + return CodeCommitPullRequestResponse(response.get("pullRequest", {})) + + def publish_comment(self, repo_name: str, pr_number: int, destination_commit: str, source_commit: str, comment: str): + """ + Publish a comment to a pull request + + Parameters: + - repo_name: name of the repository + - pr_number: number of the pull request + - destination_commit: The commit hash you want to merge into (the "before" hash) (usually on the main or master branch) + - source_commit: The commit hash of the code you are adding (the "after" branch) + - pr_comment: comment + + Returns: + - None + + Boto3 Documentation: + aws codecommit post_comment_for_pull_request + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/post_comment_for_pull_request.html + """ + if self.boto_client is None: + self._connect_boto_client() + + try: + self.boto_client.post_comment_for_pull_request( + pullRequestId=str(pr_number), + repositoryName=repo_name, + beforeCommitId=destination_commit, + 
afterCommitId=source_commit, + content=comment, + ) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException': + raise ValueError(f"Repository does not exist: {repo_name}") from e + if e.response["Error"]["Code"] == 'PullRequestDoesNotExistException': + raise ValueError(f"PR number does not exist: {pr_number}") from e + raise ValueError(f"Boto3 client error calling post_comment_for_pull_request") from e + except Exception as e: + raise ValueError(f"Error calling post_comment_for_pull_request") from e diff --git a/pr_agent/git_providers/codecommit_provider.py b/pr_agent/git_providers/codecommit_provider.py new file mode 100644 index 00000000..a747e7f2 --- /dev/null +++ b/pr_agent/git_providers/codecommit_provider.py @@ -0,0 +1,363 @@ +import logging +import os +from collections import Counter +from typing import List, Optional, Tuple +from urllib.parse import urlparse + +from ..algo.language_handler import is_valid_file, language_extension_map +from ..algo.pr_processing import clip_tokens +from ..algo.utils import load_large_diff +from ..config_loader import get_settings +from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider, IncrementalPR +from pr_agent.git_providers.codecommit_client import CodeCommitClient + + +class PullRequestCCMimic: + """ + This class mimics the PullRequest class from the PyGithub library for the CodeCommitProvider. + """ + + def __init__(self, title: str, diff_files: List[FilePatchInfo]): + self.title = title + self.diff_files = diff_files + self.description = None + self.source_commit = None + self.source_branch = None # the branch containing your new code changes + self.destination_commit = None + self.destination_branch = None # the branch you are going to merge into + + +class CodeCommitFile: + """ + This class represents a file in a pull request in CodeCommit. 
+ """ + + def __init__( + self, + a_path: str, + a_blob_id: str, + b_path: str, + b_blob_id: str, + edit_type: EDIT_TYPE, + ): + self.a_path = a_path + self.a_blob_id = a_blob_id + self.b_path = b_path + self.b_blob_id = b_blob_id + self.edit_type: EDIT_TYPE = edit_type + self.filename = b_path if b_path else a_path + + +class CodeCommitProvider(GitProvider): + """ + This class implements the GitProvider interface for AWS CodeCommit repositories. + """ + + def __init__(self, pr_url: Optional[str] = None, incremental: Optional[bool] = False): + self.codecommit_client = CodeCommitClient() + self.aws_client = None + self.repo_name = None + self.pr_num = None + self.pr = None + self.diff_files = None + self.git_files = None + if pr_url: + self.set_pr(pr_url) + + def provider_name(self): + return "CodeCommit" + + def is_supported(self, capability: str) -> bool: + if capability in [ + "get_issue_comments", + "create_inline_comment", + "publish_inline_comments", + "get_labels", + ]: + return False + return True + + def set_pr(self, pr_url: str): + self.repo_name, self.pr_num = self._parse_pr_url(pr_url) + self.pr = self._get_pr() + + def get_files(self) -> list[CodeCommitFile]: + # bring files from CodeCommit only once + if self.git_files: + return self.git_files + + self.git_files = [] + differences = self.codecommit_client.get_differences(self.repo_name, self.pr.destination_commit, self.pr.source_commit) + for item in differences: + self.git_files.append(CodeCommitFile(item.before_blob_path, + item.before_blob_id, + item.after_blob_path, + item.after_blob_id, + CodeCommitProvider._get_edit_type(item.change_type))) + return self.git_files + + def get_diff_files(self) -> list[FilePatchInfo]: + """ + Retrieves the list of files that have been modified, added, deleted, or renamed in a pull request in CodeCommit, + along with their content and patch information. 
+ + Returns: + diff_files (List[FilePatchInfo]): List of FilePatchInfo objects representing the modified, added, deleted, + or renamed files in the merge request. + """ + # bring files from CodeCommit only once + if self.diff_files: + return self.diff_files + + self.diff_files = [] + + files = self.get_files() + for diff_item in files: + patch_filename = "" + if diff_item.a_blob_id is not None: + patch_filename = diff_item.a_path + original_file_content_str = self.codecommit_client.get_file( + self.repo_name, diff_item.a_path, self.pr.destination_commit) + if isinstance(original_file_content_str, (bytes, bytearray)): + original_file_content_str = original_file_content_str.decode("utf-8") + else: + original_file_content_str = "" + + if diff_item.b_blob_id is not None: + patch_filename = diff_item.b_path + new_file_content_str = self.codecommit_client.get_file(self.repo_name, diff_item.b_path, self.pr.source_commit) + if isinstance(new_file_content_str, (bytes, bytearray)): + new_file_content_str = new_file_content_str.decode("utf-8") + else: + new_file_content_str = "" + + patch = load_large_diff(patch_filename, new_file_content_str, original_file_content_str) + + # Store the diffs as a list of FilePatchInfo objects + info = FilePatchInfo( + original_file_content_str, + new_file_content_str, + patch, + diff_item.b_path, + edit_type=diff_item.edit_type, + old_filename=None + if diff_item.a_path == diff_item.b_path + else diff_item.a_path, + ) + # Only add valid files to the diff list + # "bad extensions" are set in the language_extensions.toml file + # a "valid file" is one that is not in the "bad extensions" list + if is_valid_file(info.filename): + self.diff_files.append(info) + + return self.diff_files + + def publish_description(self, pr_title: str, pr_body: str): + return "" # not implemented yet + + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + if is_temporary: + logging.info(pr_comment) + return + + try: + 
self.codecommit_client.publish_comment( + repo_name=self.repo_name, + pr_number=str(self.pr_num), + destination_commit=self.pr.destination_commit, + source_commit=self.pr.source_commit, + comment=pr_comment, + ) + except Exception as e: + raise ValueError(f"CodeCommit Cannot post comment for PR: {self.pr_num}") from e + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + return [""] # not implemented yet + + def publish_labels(self, labels): + return [""] # not implemented yet + + def get_labels(self): + return [""] # not implemented yet + + def remove_initial_comment(self): + return "" # not implemented yet + + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + raise NotImplementedError("CodeCommit provider does not support publishing inline comments yet") + + def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): + raise NotImplementedError("CodeCommit provider does not support creating inline comments yet") + + def publish_inline_comments(self, comments: list[dict]): + raise NotImplementedError("CodeCommit provider does not support publishing inline comments yet") + + def get_title(self): + return self.pr.get("title", "") + + def get_languages(self): + """ + Returns a dictionary of languages, containing the percentage of each language used in the PR. + + Returns: + dict: A dictionary where each key is a language name and the corresponding value is the percentage of that language in the PR. + """ + commit_files = self.get_files() + filenames = [ item.filename for item in commit_files ] + extensions = CodeCommitProvider._get_file_extensions(filenames) + + # Calculate the percentage of each file extension in the PR + percentages = CodeCommitProvider._get_language_percentages(extensions) + + # The global language_extension_map is a dictionary of languages, + # where each dictionary item is a BoxList of extensions. 
+ # We want a dictionary of extensions, + # where each dictionary item is a language name. + # We build that language->extension dictionary here in main_extensions_flat. + main_extensions_flat = {} + for language, extensions in language_extension_map.items(): + for ext in extensions: + main_extensions_flat[ext] = language + + # Map the file extension/languages to percentages + languages = {} + for ext, pct in percentages.items(): + languages[main_extensions_flat.get(ext, "")] = pct + + return languages + + def get_pr_branch(self): + return self.pr.source_branch + + def get_pr_description_full(self) -> str: + return self.pr.description + + def get_user_id(self): + return -1 # not implemented yet + + def get_issue_comments(self): + raise NotImplementedError("CodeCommit provider does not support issue comments yet") + + def get_repo_settings(self): + # a local ".pr_agent.toml" settings file is optional + settings_filename = ".pr_agent.toml" + return self.codecommit_client.get_file(self.repo_name, settings_filename, self.pr.source_commit, optional=True) + + def add_eyes_reaction(self, issue_comment_id: int) -> Optional[int]: + return True + + def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + return True + + @staticmethod + def _parse_pr_url(pr_url: str) -> Tuple[str, int]: + # Example PR URL: + # https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/__MY_REPO__/pull-requests/123456" + parsed_url = urlparse(pr_url) + + if "us-east-1.console.aws.amazon.com" not in parsed_url.netloc: + raise ValueError(f"The provided URL is not a valid CodeCommit URL: {pr_url}") + + path_parts = parsed_url.path.strip("/").split("/") + + if ( + len(path_parts) < 6 + or path_parts[0] != "codesuite" + or path_parts[1] != "codecommit" + or path_parts[2] != "repositories" + or path_parts[4] != "pull-requests" + ): + raise ValueError(f"The provided URL does not appear to be a CodeCommit PR URL: {pr_url}") + + repo_name = path_parts[3] + + try: + 
pr_number = int(path_parts[5]) + except ValueError as e: + raise ValueError(f"Unable to convert PR number to integer: '{path_parts[5]}'") from e + + return repo_name, pr_number + + def _get_pr(self): + response = self.codecommit_client.get_pr(self.pr_num) + + if len(response.targets) == 0: + raise ValueError(f"No files found in CodeCommit PR: {self.pr_num}") + + # TODO: implement support for multiple commits in one CodeCommit PR + # for now, we are only using the first commit in the PR + if len(response.targets) > 1: + logging.warning( + "Multiple commits in one PR is not supported for CodeCommit yet. Continuing, using the first commit only..." + ) + + # Return our object that mimics PullRequest class from the PyGithub library + # (This strategy was copied from the LocalGitProvider) + mimic = PullRequestCCMimic(response.title, self.diff_files) + mimic.description = response.description + mimic.source_commit = response.targets[0].source_commit + mimic.source_branch = response.targets[0].source_branch + mimic.destination_commit = response.targets[0].destination_commit + mimic.destination_branch = response.targets[0].destination_branch + + return mimic + + def get_commit_messages(self): + return "" # not implemented yet + + @staticmethod + def _get_edit_type(codecommit_change_type): + """ + Convert the CodeCommit change type string to the EDIT_TYPE enum. + The CodeCommit change type string is returned from the get_differences SDK method. + + Returns: + An EDIT_TYPE enum representing the modified, added, deleted, or renamed file in the PR diff. + """ + t = codecommit_change_type.upper() + edit_type = None + if t == "A": + edit_type = EDIT_TYPE.ADDED + elif t == "D": + edit_type = EDIT_TYPE.DELETED + elif t == "M": + edit_type = EDIT_TYPE.MODIFIED + elif t == "R": + edit_type = EDIT_TYPE.RENAMED + return edit_type + + @staticmethod + def _get_file_extensions(filenames): + """ + Return a list of file extensions from a list of filenames. 
+ The returned extensions will include the dot "." prefix, + to accommodate for the dots in the existing language_extension_map settings. + Filenames with no extension will return an empty string for the extension. + """ + extensions = [] + for filename in filenames: + filename, ext = os.path.splitext(filename) + if ext: + extensions.append(ext.lower()) + else: + extensions.append("") + return extensions + + @staticmethod + def _get_language_percentages(extensions): + """ + Return a dictionary containing the programming language name (as the key), + and the percentage that language is used (as the value), + given a list of file extensions. + """ + total_files = len(extensions) + if total_files == 0: + return {} + + # Identify language by file extension and count + lang_count = Counter(extensions) + # Convert counts to percentages + lang_percentage = { + lang: round(count / total_files * 100) for lang, count in lang_count.items() + } + return lang_percentage diff --git a/pyproject.toml b/pyproject.toml index 2e8f2b5c..802cd0d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dependencies = [ "GitPython~=3.1.32", "starlette-context==0.3.6", "litellm~=0.1.351", - "PyYAML==6.0" + "PyYAML==6.0", + "boto3~=1.28.25" ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 470fc6ef..f7af1669 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ litellm~=0.1.351 PyYAML==6.0 starlette-context==0.3.6 litellm~=0.1.351 +boto3~=1.28.25 diff --git a/tests/unittest/test_codecommit_client.py b/tests/unittest/test_codecommit_client.py new file mode 100644 index 00000000..5d09bdd1 --- /dev/null +++ b/tests/unittest/test_codecommit_client.py @@ -0,0 +1,136 @@ +from unittest.mock import MagicMock +from pr_agent.git_providers.codecommit_client import CodeCommitClient + + +class TestCodeCommitProvider: + def test_get_differences(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + 
api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_differences method + api.boto_client.get_paginator.return_value.paginate.return_value = [ + { + "differences": [ + { + "beforeBlob": { + "path": "file1.py", + "blobId": "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b", + }, + "afterBlob": { + "path": "file1.py", + "blobId": "46ad86582da03cc34c804c24b17976571bca1eba", + }, + "changeType": "M", + }, + { + "beforeBlob": {"path": "", "blobId": ""}, + "afterBlob": { + "path": "file2.py", + "blobId": "2404c7874fcbd684d6779c1420072f088647fd79", + }, + "changeType": "A", + }, + { + "beforeBlob": { + "path": "file3.py", + "blobId": "9af7989045ce40e9478ebb8089dfbadac19a9cde", + }, + "afterBlob": {"path": "", "blobId": ""}, + "changeType": "D", + }, + { + "beforeBlob": { + "path": "file5.py", + "blobId": "738e36eec120ef9d6393a149252698f49156d5b4", + }, + "afterBlob": { + "path": "file6.py", + "blobId": "faecdb85f7ba199df927a783b261378a1baeca85", + }, + "changeType": "R", + }, + ] + } + ] + + diffs = api.get_differences("my_test_repo", "commit1", "commit2") + + assert len(diffs) == 4 + assert diffs[0].before_blob_path == "file1.py" + assert diffs[0].before_blob_id == "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b" + assert diffs[0].after_blob_path == "file1.py" + assert diffs[0].after_blob_id == "46ad86582da03cc34c804c24b17976571bca1eba" + assert diffs[0].change_type == "M" + assert diffs[1].before_blob_path == "" + assert diffs[1].before_blob_id == "" + assert diffs[1].after_blob_path == "file2.py" + assert diffs[1].after_blob_id == "2404c7874fcbd684d6779c1420072f088647fd79" + assert diffs[1].change_type == "A" + assert diffs[2].before_blob_path == "file3.py" + assert diffs[2].before_blob_id == "9af7989045ce40e9478ebb8089dfbadac19a9cde" + assert diffs[2].after_blob_path == "" + assert diffs[2].after_blob_id == "" + assert diffs[2].change_type == "D" + assert diffs[3].before_blob_path == "file5.py" + assert diffs[3].before_blob_id == 
"738e36eec120ef9d6393a149252698f49156d5b4" + assert diffs[3].after_blob_path == "file6.py" + assert diffs[3].after_blob_id == "faecdb85f7ba199df927a783b261378a1baeca85" + assert diffs[3].change_type == "R" + + def test_get_file(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_pull_request method + # def get_file(self, repo_name: str, file_path: str, sha_hash: str): + api.boto_client.get_file.return_value = { + "commitId": "6335d6d4496e8d50af559560997604bb03abc122", + "blobId": "c172209495d7968a8fdad76469564fb708460bc1", + "filePath": "requirements.txt", + "fileSize": 65, + "fileContent": b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n", + } + + repo_name = "my_test_repo" + file_path = "requirements.txt" + sha_hash = "84114a356ece1e5b7637213c8e486fea7c254656" + content = api.get_file(repo_name, file_path, sha_hash) + + assert len(content) == 65 + assert content == b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" + assert content.decode("utf-8") == "boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" + + def test_get_pr(self): + # Create a mock CodeCommitClient instance and codecommit_client member + api = CodeCommitClient() + api.boto_client = MagicMock() + + # Mock the response from the AWS client for get_pull_request method + api.boto_client.get_pull_request.return_value = { + "pullRequest": { + "pullRequestId": "3", + "title": "My PR", + "description": "My PR description", + "pullRequestTargets": [ + { + "sourceCommit": "commit1", + "sourceReference": "branch1", + "destinationCommit": "commit2", + "destinationReference": "branch2", + "repositoryName": "my_test_repo", + } + ], + } + } + + pr = api.get_pr(321) + + assert pr.title == "My PR" + assert pr.description == "My PR description" + assert len(pr.targets) == 1 + assert pr.targets[0].source_commit == 
"commit1" + assert pr.targets[0].source_branch == "branch1" + assert pr.targets[0].destination_commit == "commit2" + assert pr.targets[0].destination_branch == "branch2" diff --git a/tests/unittest/test_codecommit_provider.py b/tests/unittest/test_codecommit_provider.py new file mode 100644 index 00000000..e35f7250 --- /dev/null +++ b/tests/unittest/test_codecommit_provider.py @@ -0,0 +1,119 @@ +import pytest +from pr_agent.git_providers.codecommit_provider import CodeCommitFile +from pr_agent.git_providers.codecommit_provider import CodeCommitProvider +from pr_agent.git_providers.git_provider import EDIT_TYPE + + +class TestCodeCommitFile: + # Test that a CodeCommitFile object is created successfully with valid parameters. + # Generated by CodiumAI + def test_valid_parameters(self): + a_path = "path/to/file_a" + a_blob_id = "12345" + b_path = "path/to/file_b" + b_blob_id = "67890" + edit_type = EDIT_TYPE.ADDED + + file = CodeCommitFile(a_path, a_blob_id, b_path, b_blob_id, edit_type) + + assert file.a_path == a_path + assert file.a_blob_id == a_blob_id + assert file.b_path == b_path + assert file.b_blob_id == b_blob_id + assert file.edit_type == edit_type + assert file.filename == b_path + + +class TestCodeCommitProvider: + def test_parse_pr_url(self): + url = "https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/my_test_repo/pull-requests/321" + repo_name, pr_number = CodeCommitProvider._parse_pr_url(url) + assert repo_name == "my_test_repo" + assert pr_number == 321 + + # Test that an error is raised when an invalid CodeCommit URL is provided to the set_pr() method of the CodeCommitProvider class. 
+ # Generated by CodiumAI + def test_invalid_codecommit_url(self): + provider = CodeCommitProvider() + with pytest.raises(ValueError): + provider.set_pr("https://example.com/codecommit/repositories/my_test_repo/pull-requests/4321") + + def test_get_file_extensions(self): + filenames = [ + "app.py", + "cli.py", + "composer.json", + "composer.lock", + "hello.py", + "image1.jpg", + "image2.JPG", + "index.js", + "provider.py", + "README", + "test.py", + ] + expected_extensions = [ + ".py", + ".py", + ".json", + ".lock", + ".py", + ".jpg", + ".jpg", + ".js", + ".py", + "", + ".py", + ] + extensions = CodeCommitProvider._get_file_extensions(filenames) + assert extensions == expected_extensions + + def test_get_language_percentages(self): + extensions = [ + ".py", + ".py", + ".json", + ".lock", + ".py", + ".jpg", + ".jpg", + ".js", + ".py", + "", + ".py", + ] + percentages = CodeCommitProvider._get_language_percentages(extensions) + assert percentages[".py"] == 45 + assert percentages[".json"] == 9 + assert percentages[".lock"] == 9 + assert percentages[".jpg"] == 18 + assert percentages[".js"] == 9 + assert percentages[""] == 9 + + # The _get_file_extensions function needs the "." prefix on the extension, + # but the _get_language_percentages function will work with or without the "." 
prefix + extensions = [ + "txt", + "py", + "py", + ] + percentages = CodeCommitProvider._get_language_percentages(extensions) + assert percentages["py"] == 67 + assert percentages["txt"] == 33 + + # test an empty list + percentages = CodeCommitProvider._get_language_percentages([]) + assert percentages == {} + + def test_get_edit_type(self): + assert CodeCommitProvider._get_edit_type("A") == EDIT_TYPE.ADDED + assert CodeCommitProvider._get_edit_type("D") == EDIT_TYPE.DELETED + assert CodeCommitProvider._get_edit_type("M") == EDIT_TYPE.MODIFIED + assert CodeCommitProvider._get_edit_type("R") == EDIT_TYPE.RENAMED + + assert CodeCommitProvider._get_edit_type("a") == EDIT_TYPE.ADDED + assert CodeCommitProvider._get_edit_type("d") == EDIT_TYPE.DELETED + assert CodeCommitProvider._get_edit_type("m") == EDIT_TYPE.MODIFIED + assert CodeCommitProvider._get_edit_type("r") == EDIT_TYPE.RENAMED + + assert CodeCommitProvider._get_edit_type("X") is None From 3a93dcd6a75cd8213d7e41c49108bc0400285fd6 Mon Sep 17 00:00:00 2001 From: Ori Kotek Date: Wed, 23 Aug 2023 00:37:04 +0300 Subject: [PATCH 14/14] Add build and test on pull request open, reopen --- .github/workflows/build-and-test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-and-test.yaml b/.github/workflows/build-and-test.yaml index 960da61b..114bbb7e 100644 --- a/.github/workflows/build-and-test.yaml +++ b/.github/workflows/build-and-test.yaml @@ -2,6 +2,8 @@ name: Build-and-test on: push: + pull_request: + types: [ opened, reopened ] jobs: build-and-test: