Add the 'add_docs' tool (PRAddDocs), which asks the model to document the code components
that a PR adds or edits and publishes the result as inline code suggestions.

Review notes for this revision of the patch:
- configuration.toml: the default verbosity_level is left at 0 (the debug override to 2 is dropped).
- pr_add_docs.toml: prompt typos fixed.
- pr_add_docs.py: suggestions are built from the 'documented code' field of the prompt schema
  (the schema defines no 'suggestion content' or 'improved code' field); chunks without
  suggestions are skipped in extended mode; docstrings added.
- The post-image blob ids on the 'index' lines below were not recomputed.

diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py
index 07c34c51..217ca192 100644
--- a/pr_agent/agent/pr_agent.py
+++ b/pr_agent/agent/pr_agent.py
@@ -6,6 +6,7 @@ import tempfile
 from pr_agent.algo.utils import update_settings_from_args
 from pr_agent.config_loader import get_settings
 from pr_agent.git_providers import get_git_provider
+from pr_agent.tools.pr_add_docs import PRAddDocs
 from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
 from pr_agent.tools.pr_description import PRDescription
 from pr_agent.tools.pr_information_from_user import PRInformationFromUser
@@ -32,6 +33,7 @@ command2class = {
     "config": PRConfig,
     "settings": PRConfig,
     "similar_issue": PRSimilarIssue,
+    "add_docs": PRAddDocs,
 }
 
 commands = list(command2class.keys())
diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py
index 47edfd97..184adb82 100644
--- a/pr_agent/config_loader.py
+++ b/pr_agent/config_loader.py
@@ -22,6 +22,7 @@ global_settings = Dynaconf(
         "settings/pr_sort_code_suggestions_prompts.toml",
         "settings/pr_information_from_user_prompts.toml",
         "settings/pr_update_changelog_prompts.toml",
+        "settings/pr_add_docs.toml",
         "settings_prod/.secrets.toml"
     ]]
 )
diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index aa28564c..ccd76060 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -47,6 +47,9 @@ rank_extended_suggestions = true
 max_number_of_calls = 5
 final_clip_factor = 0.9
 
+[pr_add_docs_prompt] # /add_docs #
+extra_instructions = ""
+
 [pr_update_changelog] # /update_changelog #
 push_changelog_changes=false
 extra_instructions = ""
diff --git a/pr_agent/settings/pr_add_docs.toml b/pr_agent/settings/pr_add_docs.toml
new file mode 100644
index 00000000..e117447f
--- /dev/null
+++ b/pr_agent/settings/pr_add_docs.toml
@@ -0,0 +1,122 @@
+[pr_add_docs_prompt]
+system="""You are a language model called PR-Code-Documentation Agent, that specializes in documenting code.
+Your task is to provide meaningful {{ docs_for_language }} in a PR (the '+' lines).
+
+Example for a PR Diff input:
+'
+## src/file1.py
+
+@@ -12,3 +12,5 @@ def func1():
+__new hunk__
+12  code line that already existed in the file...
+13  code line that already existed in the file....
+14 +new code line1 added in the PR
+15 +new code line2 added in the PR
+16  code line that already existed in the file...
+__old hunk__
+ code line that already existed in the file...
+-code line that was removed in the PR
+ code line that already existed in the file...
+
+
+@@ ... @@ def func2():
+__new hunk__
+...
+__old hunk__
+...
+
+
+## src/file2.py
+...
+'
+
+Specific instructions:
+- Try to identify edited/added code components (classes/functions/methods...) that are undocumented, generate {{ docs_for_language }} for each one of the edited/added code components.
+- If there are no edited/added code components, don't generate {{ docs_for_language }} for the edited/added code lines.
+- If there are edited/added code components, but they are already documented, don't generate {{ docs_for_language }} for the edited/added code lines.
+- Make sure the {{ docs_for_language }} starts and ends with standard {{ language }} {{ docs_for_language }} signs.
+- The {{ docs_for_language }} should be in standard format.
+- Except for the {{ docs_for_language }}, the new code should be identical to the original code snippet. Keep existing code comments, line comments, blank lines, formatting, etc.
+- Documentation should refer only to the 'new hunk' code, and focus on improving the new added code lines, with '+'.
+- Provide the exact line number range (inclusive) for each issue.
+- Assume there is additional code in the relevant file that is not included in the diff.
+- Don't output line numbers in the 'documented code' snippets.
+
+
+{%- if extra_instructions %}
+
+Extra instructions from the user:
+{{ extra_instructions }}
+{%- endif %}
+
+You must use the following YAML schema to format your answer:
+```yaml
+Code suggestions:
+  type: array
+  uniqueItems: true
+  items:
+    relevant file:
+      type: string
+      description: the relevant file full path
+    existing code:
+      type: string
+      description: |-
+        a code snippet showing the relevant code lines from a '__new hunk__' section.
+        It must be contiguous, correctly formatted and indented, and without line numbers.
+    relevant lines start:
+      type: integer
+      description: |-
+        The relevant line number from a '__new hunk__' section where the suggestion starts (inclusive).
+        Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above.
+    relevant lines end:
+      type: integer
+      description: |-
+        The relevant line number from a '__new hunk__' section where the suggestion ends (inclusive).
+        Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above.
+    documented code:
+      type: string
+      description: |-
+        a new code snippet that can be used to replace the relevant lines in '__new hunk__' code.
+        {{ language }} {{ docs_for_language }} Replacement {{ language }} {{ docs_for_language }} should be complete, correctly formatted and indented, and without line numbers.
+```
+
+Example output:
+```yaml
+Code suggestions:
+  - relevant file: |-
+      src/file1.py
+    existing code: |-
+      def func1():
+    relevant lines start: 12
+    relevant lines end: 12
+    documented code: |-
+      ...
+```
+
+
+Each YAML output MUST be after a newline, indented, with block scalar indicator ('|-').
+Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields.
+"""
+
+user="""PR Info:
+
+Title: '{{title}}'
+
+Branch: '{{branch}}'
+
+Description: '{{description}}'
+
+{%- if language %}
+
+Main language: {{language}}
+{%- endif %}
+
+
+The PR Diff:
+```
+{{- diff|trim }}
+```
+
+Response (should be a valid YAML, and nothing else):
+```yaml
+"""
diff --git a/pr_agent/tools/pr_add_docs.py b/pr_agent/tools/pr_add_docs.py
new file mode 100644
index 00000000..db06d2df
--- /dev/null
+++ b/pr_agent/tools/pr_add_docs.py
@@ -0,0 +1,335 @@
+import copy
+import logging
+import textwrap
+from typing import List, Dict
+from jinja2 import Environment, StrictUndefined
+
+from pr_agent.algo.ai_handler import AiHandler
+from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_multi_diffs
+from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.algo.utils import load_yaml
+from pr_agent.config_loader import get_settings
+from pr_agent.git_providers import BitbucketProvider, get_git_provider
+from pr_agent.git_providers.git_provider import get_main_pr_language
+
+
+def get_docs_for_language(language):
+    """
+    Map a programming language name to the name of its documentation convention.
+
+    Args:
+        language (str): The main language of the PR; matched case-insensitively.
+
+    Returns:
+        str: The documentation style name used in the prompts; "docs" when the language is empty
+             or not recognized.
+    """
+    language = (language or "").lower()  # tolerate a missing main language
+    if language == 'java':
+        return "javadocs"
+    elif language in ['python', 'lisp', 'clojure']:
+        return "docstrings"
+    elif language in ['javascript', 'typescript']:
+        return "jsdocs"
+    elif language == 'c++':
+        return "doxygen"
+    else:
+        return "docs"
+
+
+class PRAddDocs:
+    """
+    Tool behind the 'add_docs' command: asks the model to document the code components that a PR
+    adds or edits, and publishes the answer as inline code suggestions.
+    """
+
+    def __init__(self, pr_url: str, cli_mode=False, args: list = None):
+        """
+        Args:
+            pr_url (str): URL of the pull request to document.
+            cli_mode (bool): Stored on the instance as 'self.cli_mode'.
+            args (list): Optional command arguments; any argument containing "extended" turns on
+                         the extended (multi-chunk) mode.
+        """
+
+        self.git_provider = get_git_provider()(pr_url)
+        self.main_language = get_main_pr_language(
+            self.git_provider.get_languages(), self.git_provider.get_files()
+        )
+
+        # extended mode
+        try:
+            self.is_extended = any("extended" in arg for arg in args)
+        except TypeError:  # args is None, or holds an item that does not support 'in'
+            self.is_extended = False
+
+        self.ai_handler = AiHandler()
+        self.patches_diff = None
+        self.prediction = None
+        self.cli_mode = cli_mode
+        self.vars = {
+            "title": self.git_provider.pr.title,
+            "branch": self.git_provider.get_pr_branch(),
+            "description": self.git_provider.get_pr_description(),
+            "language": self.main_language,
+            "diff": "",  # empty diff for initial calculation
+            "extra_instructions": get_settings().pr_add_docs_prompt.extra_instructions,
+            "commit_messages_str": self.git_provider.get_commit_messages(),
+            'docs_for_language': get_docs_for_language(self.main_language),
+        }
+        self.token_handler = TokenHandler(self.git_provider.pr,
+                                          self.vars,
+                                          get_settings().pr_add_docs_prompt.system,
+                                          get_settings().pr_add_docs_prompt.user)
+
+    async def run(self):
+        """
+        Generate documentation suggestions for the PR and, when 'config.publish_output' is set,
+        publish them as inline code suggestions. Errors are logged rather than raised.
+        """
+        try:
+            logging.info('Generating code Docs for PR...')
+            if get_settings().config.publish_output:
+                self.git_provider.publish_comment("Preparing review...", is_temporary=True)
+
+            logging.info('Preparing PR review...')
+            if not self.is_extended:
+                await retry_with_fallback_models(self._prepare_prediction)
+                data = self._prepare_pr_code_suggestions()
+            else:
+                data = await retry_with_fallback_models(self._prepare_prediction_extended)
+            if (not data) or ('Code suggestions' not in data):
+                logging.info('No code suggestions found for PR.')
+                if get_settings().config.publish_output:
+                    # do not leave the temporary "Preparing review..." comment behind
+                    self.git_provider.remove_initial_comment()
+                return
+
+            # ranking is switched on/off by the 'pr_code_suggestions' settings section
+            if (not self.is_extended and get_settings().pr_code_suggestions.rank_suggestions) or \
+                    (self.is_extended and get_settings().pr_code_suggestions.rank_extended_suggestions):
+                logging.info('Ranking Suggestions...')
+                data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions'])
+
+            if get_settings().config.publish_output:
+                logging.info('Pushing PR review...')
+                self.git_provider.remove_initial_comment()
+                logging.info('Pushing inline code suggestions...')
+                self.push_inline_code_suggestions(data)
+        except Exception as e:
+            logging.error(f"Failed to generate code suggestions for PR, error: {e}")
+
+    async def _prepare_prediction(self, model: str):
+        """Build the line-numbered PR diff and store the model's answer in 'self.prediction'."""
+        logging.info('Getting PR diff...')
+        self.patches_diff = get_pr_diff(self.git_provider,
+                                        self.token_handler,
+                                        model,
+                                        add_line_numbers_to_hunks=True,
+                                        disable_extra_lines=True)
+
+        logging.info('Getting AI prediction...')
+        self.prediction = await self._get_prediction(model)
+
+    async def _get_prediction(self, model: str):
+        """
+        Render the system and user prompts with the current diff and query the model.
+
+        Args:
+            model (str): The model to query.
+
+        Returns:
+            The response part of the AI handler's chat completion result.
+        """
+        variables = copy.deepcopy(self.vars)
+        variables["diff"] = self.patches_diff  # update diff
+        environment = Environment(undefined=StrictUndefined)
+        system_prompt = environment.from_string(get_settings().pr_add_docs_prompt.system).render(variables)
+        user_prompt = environment.from_string(get_settings().pr_add_docs_prompt.user).render(variables)
+        if get_settings().config.verbosity_level >= 2:
+            logging.info(f"\nSystem prompt:\n{system_prompt}")
+            logging.info(f"\nUser prompt:\n{user_prompt}")
+        response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2,
+                                                                        system=system_prompt, user=user_prompt)
+
+        return response
+
+    def _prepare_pr_code_suggestions(self) -> Dict:
+        """
+        Parse 'self.prediction' as YAML.
+
+        Returns:
+            Dict: The parsed answer; a bare list is wrapped as {'Code suggestions': <list>}.
+        """
+        review = self.prediction.strip()
+        data = load_yaml(review)
+        if isinstance(data, list):
+            data = {'Code suggestions': data}
+        return data
+
+    def push_inline_code_suggestions(self, data):
+        """
+        Publish each documentation suggestion in 'data' as an inline code suggestion.
+
+        Args:
+            data (dict): The parsed model answer; 'Code suggestions' holds the suggestion items.
+        """
+        code_suggestions = []
+
+        if not data['Code suggestions']:
+            return self.git_provider.publish_comment('No suggestions found to improve this PR.')
+
+        for d in data['Code suggestions']:
+            try:
+                if get_settings().config.verbosity_level >= 2:
+                    logging.info(f"suggestion: {d}")
+                relevant_file = d['relevant file'].strip()
+                relevant_lines_start = int(d['relevant lines start'])  # absolute position
+                relevant_lines_end = int(d['relevant lines end'])
+                new_code_snippet = d['documented code']
+                if not new_code_snippet:
+                    # an empty suggestion block would propose deleting the relevant lines
+                    continue
+
+                new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet)
+
+                # the prompt schema has no 'suggestion content' field, so a fixed caption is used
+                body = "**Suggestion:** Proposed documentation\n```suggestion\n" + new_code_snippet + "\n```"
+                code_suggestions.append({'body': body, 'relevant_file': relevant_file,
+                                         'relevant_lines_start': relevant_lines_start,
+                                         'relevant_lines_end': relevant_lines_end})
+            except Exception as e:
+                if get_settings().config.verbosity_level >= 2:
+                    logging.info(f"Could not parse suggestion: {d}, error: {e}")
+
+        is_successful = self.git_provider.publish_code_suggestions(code_suggestions)
+        if not is_successful:
+            logging.info("Failed to publish code suggestions, trying to publish each suggestion separately")
+            for code_suggestion in code_suggestions:
+                self.git_provider.publish_code_suggestions([code_suggestion])
+
+    def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet):
+        """
+        Align the indentation of a suggested snippet with the line it replaces in the head file.
+
+        Despite the name, the snippet is only ever indented further: leading spaces are added when
+        the original line is indented deeper than the snippet's first line.
+
+        Args:
+            relevant_file (str): Path of the file the suggestion refers to.
+            relevant_lines_start (int): 1-based line number where the suggestion starts.
+            new_code_snippet (str): The suggested replacement code.
+
+        Returns:
+            str: The re-indented snippet, or the snippet unchanged when nothing had to be (or
+                 could be) adjusted.
+        """
+        try:  # dedent code snippet
+            self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \
+                else self.git_provider.get_diff_files()
+            original_initial_line = None
+            for file in self.diff_files:
+                if file.filename.strip() == relevant_file:
+                    original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1]
+                    break
+            if original_initial_line:
+                suggested_initial_line = new_code_snippet.splitlines()[0]
+                original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip())
+                suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip())
+                delta_spaces = original_initial_spaces - suggested_initial_spaces
+                if delta_spaces > 0:
+                    new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n')
+        except Exception as e:
+            if get_settings().config.verbosity_level >= 2:
+                logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}")
+
+        return new_code_snippet
+
+    async def _prepare_prediction_extended(self, model: str) -> dict:
+        """
+        Extended mode: split the PR diff into chunks, query the model once per chunk and merge the
+        suggestions of all chunks.
+
+        Args:
+            model (str): The model to query.
+
+        Returns:
+            dict: {'Code suggestions': [...]} with the merged suggestions, or an empty dict when no
+                  chunk produced any.
+        """
+        logging.info('Getting PR diff...')
+        patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model,
+                                               max_calls=get_settings().pr_code_suggestions.max_number_of_calls)
+
+        logging.info('Getting multi AI predictions...')
+        prediction_list = []
+        for i, patches_diff in enumerate(patches_diff_list):
+            logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}")
+            self.patches_diff = patches_diff
+            prediction = await self._get_prediction(model)
+            prediction_list.append(prediction)
+        self.prediction_list = prediction_list
+
+        data = {}
+        for prediction in prediction_list:
+            self.prediction = prediction
+            data_per_chunk = self._prepare_pr_code_suggestions()
+            # a chunk may yield no (or unparsable) suggestions; skip it instead of failing the run
+            if not isinstance(data_per_chunk, dict) or not data_per_chunk.get("Code suggestions"):
+                continue
+            data.setdefault("Code suggestions", []).extend(data_per_chunk["Code suggestions"])
+        self.data = data
+        return data
+
+    async def rank_suggestions(self, data: List) -> List:
+        """
+        Call a model to rank (sort) code suggestions based on their importance order.
+
+        Args:
+            data (List): A list of code suggestions to be ranked.
+
+        Returns:
+            List: The ranked list of code suggestions.
+        """
+
+        # remove invalid suggestions (the documented code must differ from the existing code)
+        suggestion_list = [suggestion for suggestion in data
+                           if suggestion.get('existing code') != suggestion.get('documented code')]
+
+        data_sorted = [None] * len(suggestion_list)
+
+        try:
+            suggestion_str = ""
+            for i, suggestion in enumerate(suggestion_list):
+                suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n'
+
+            variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str}
+            model = get_settings().config.model
+            environment = Environment(undefined=StrictUndefined)
+            system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render(
+                variables)
+            user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables)
+            if get_settings().config.verbosity_level >= 2:
+                logging.info(f"\nSystem prompt:\n{system_prompt}")
+                logging.info(f"\nUser prompt:\n{user_prompt}")
+            response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt,
+                                                                            user=user_prompt)
+
+            sort_order = load_yaml(response)
+            for s in sort_order['Sort Order']:
+                suggestion_number = s['suggestion number']
+                importance_order = s['importance order']
+                data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1]
+
+            # drop the slots the model left unassigned, so only real suggestions are returned
+            data_sorted = [suggestion for suggestion in data_sorted if suggestion is not None]
+
+            if get_settings().pr_code_suggestions.final_clip_factor != 1:
+                new_len = int(0.5 + len(data_sorted) * get_settings().pr_code_suggestions.final_clip_factor)
+                data_sorted = data_sorted[:new_len]
+        except Exception as e:
+            if get_settings().config.verbosity_level >= 1:
+                logging.info(f"Could not sort suggestions, error: {e}")
+            data_sorted = suggestion_list
+
+        return data_sorted