Merge pull request #50 from Codium-ai/tr/numbered_hunks

Adding numbered hunks and code suggestions feature
This commit is contained in:
mrT23
2023-07-16 12:27:29 +03:00
committed by GitHub
11 changed files with 382 additions and 13 deletions

View File

@ -108,3 +108,78 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
logging.info(f"Processing file: {file_name}, hunks were deleted") logging.info(f"Processing file: {file_name}, hunks were deleted")
patch = patch_new patch = patch_new
return patch return patch
def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
# toDO: (maybe remove '-' and '+' from the beginning of the line)
"""
## src/file.ts
--new hunk--
881 line1
882 line2
883 line3
884 line4
885 line6
886 line7
887 + line8
888 + line9
889 line10
890 line11
...
--old hunk--
line1
line2
- line3
- line4
line5
line6
...
"""
patch_with_lines_str = f"## {file.filename}\n"
import re
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
new_content_lines = []
old_content_lines = []
match = None
start1, size1, start2, size2 = -1, -1, -1, -1
for line in patch_lines:
if 'no newline at end of file' in line.lower():
continue
if line.startswith('@@'):
match = RE_HUNK_HEADER.match(line)
if match and new_content_lines: # found a new hunk, split the previous lines
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
old_content_lines = []
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif line.startswith('+'):
new_content_lines.append(line)
elif line.startswith('-'):
old_content_lines.append(line)
else:
new_content_lines.append(line)
old_content_lines.append(line)
# finishing last hunk
if match and new_content_lines:
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '\n--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
return patch_with_lines_str.strip()

View File

@ -4,7 +4,8 @@ import difflib
import logging import logging
from typing import Any, Tuple, Union from typing import Any, Tuple, Union
from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions, \
convert_to_hunks_with_lines_numbers
from pr_agent.algo.language_handler import sort_files_by_main_languages from pr_agent.algo.language_handler import sort_files_by_main_languages
from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.token_handler import TokenHandler
from pr_agent.config_loader import settings from pr_agent.config_loader import settings
@ -19,26 +20,33 @@ OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
PATCH_EXTRA_LINES = 3 PATCH_EXTRA_LINES = 3
def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str: def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool =False) -> str:
""" """
Returns a string with the diff of the PR. Returns a string with the diff of the PR.
If needed, apply diff minimization techniques to reduce the number of tokens If needed, apply diff minimization techniques to reduce the number of tokens
""" """
if disable_extra_lines:
global PATCH_EXTRA_LINES
PATCH_EXTRA_LINES = 0
git_provider.pr.diff_files = list(git_provider.get_diff_files()) git_provider.pr.diff_files = list(git_provider.get_diff_files())
# get pr languages # get pr languages
pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files) pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files)
# generate a standard diff string, with patch extension # generate a standard diff string, with patch extension
patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler) patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler,
add_line_numbers_to_hunks)
# if we are under the limit, return the full diff # if we are under the limit, return the full diff
if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit: if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit:
return "\n".join(patches_extended) return "\n".join(patches_extended)
# if we are over the limit, start pruning # if we are over the limit, start pruning
patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages, patches_compressed, modified_file_names, deleted_file_names = \
token_handler) pr_generate_compressed_diff(pr_languages, token_handler, add_line_numbers_to_hunks)
final_diff = "\n".join(patches_compressed) final_diff = "\n".join(patches_compressed)
if modified_file_names: if modified_file_names:
modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names) modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
@ -49,7 +57,8 @@ def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHa
return final_diff return final_diff
def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
add_line_numbers_to_hunks: bool) -> \
Tuple[list, int]: Tuple[list, int]:
""" """
Generate a standard diff string, with patch extension Generate a standard diff string, with patch extension
@ -72,6 +81,9 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n"
if add_line_numbers_to_hunks:
full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)
patch_tokens = token_handler.count_tokens(full_extended_patch) patch_tokens = token_handler.count_tokens(full_extended_patch)
file.tokens = patch_tokens file.tokens = patch_tokens
total_tokens += patch_tokens total_tokens += patch_tokens
@ -80,7 +92,8 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
return patches_extended, total_tokens return patches_extended, total_tokens
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]: def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler,
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]:
# Apply Diff Minimization techniques to reduce the number of tokens: # Apply Diff Minimization techniques to reduce the number of tokens:
# 0. Start from the largest diff patch to smaller ones # 0. Start from the largest diff patch to smaller ones
# 1. Don't use extend context lines around diff # 1. Don't use extend context lines around diff
@ -114,6 +127,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
deleted_files_list.append(file.filename) deleted_files_list.append(file.filename)
total_tokens += token_handler.count_tokens(file.filename) + 1 total_tokens += token_handler.count_tokens(file.filename) + 1
continue continue
if convert_hunks_to_line_numbers:
patch = convert_to_hunks_with_lines_numbers(patch, file)
new_patch_tokens = token_handler.count_tokens(patch) new_patch_tokens = token_handler.count_tokens(patch)
# Hard Stop, no more tokens # Hard Stop, no more tokens
@ -135,7 +152,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
continue continue
if patch: if patch:
if not convert_hunks_to_line_numbers:
patch_final = f"## {file.filename}\n\n{patch}\n" patch_final = f"## {file.filename}\n\n{patch}\n"
else:
patch_final = patch
patches.append(patch_final) patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final) total_tokens += token_handler.count_tokens(patch_final)
if settings.config.verbosity_level >= 2: if settings.config.verbosity_level >= 2:

View File

@ -3,6 +3,7 @@ import asyncio
import logging import logging
import os import os
from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
from pr_agent.tools.pr_description import PRDescription from pr_agent.tools.pr_description import PRDescription
from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_questions import PRQuestions
from pr_agent.tools.pr_reviewer import PRReviewer from pr_agent.tools.pr_reviewer import PRReviewer
@ -12,7 +13,8 @@ def run():
parser = argparse.ArgumentParser(description='AI based pull request analyzer') parser = argparse.ArgumentParser(description='AI based pull request analyzer')
parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True) parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
parser.add_argument('--question', type=str, help='Optional question to ask', required=False) parser.add_argument('--question', type=str, help='Optional question to ask', required=False)
parser.add_argument('--pr_description', action='store_true', help='Optional question to ask', required=False) parser.add_argument('--pr_description', action='store_true', required=False)
parser.add_argument('--pr_code_suggestions', action='store_true', required=False)
args = parser.parse_args() args = parser.parse_args()
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
if args.question: if args.question:
@ -23,6 +25,10 @@ def run():
print(f"PR description: {args.pr_url}") print(f"PR description: {args.pr_url}")
reviewer = PRDescription(args.pr_url) reviewer = PRDescription(args.pr_url)
asyncio.run(reviewer.describe()) asyncio.run(reviewer.describe())
elif args.pr_code_suggestions:
print(f"PR code suggestions: {args.pr_url}")
reviewer = PRCodeSuggestions(args.pr_url)
asyncio.run(reviewer.suggest())
else: else:
print(f"Reviewing PR: {args.pr_url}") print(f"Reviewing PR: {args.pr_url}")
reviewer = PRReviewer(args.pr_url, cli_mode=True) reviewer = PRReviewer(args.pr_url, cli_mode=True)

View File

@ -12,6 +12,7 @@ settings = Dynaconf(
"settings/pr_reviewer_prompts.toml", "settings/pr_reviewer_prompts.toml",
"settings/pr_questions_prompts.toml", "settings/pr_questions_prompts.toml",
"settings/pr_description_prompts.toml", "settings/pr_description_prompts.toml",
"settings/pr_code_suggestions_prompts.toml",
"settings_prod/.secrets.toml" "settings_prod/.secrets.toml"
]] ]]
) )

View File

@ -37,6 +37,11 @@ class GitProvider(ABC):
def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str): def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
pass pass
@abstractmethod
def publish_code_suggestion(self, body: str, relevant_file: str,
relevant_lines_start: int, relevant_lines_end: int):
pass
@abstractmethod @abstractmethod
def remove_initial_comment(self): def remove_initial_comment(self):
pass pass

View File

@ -7,10 +7,10 @@ from github import AppAuthentication, Github
from pr_agent.config_loader import settings from pr_agent.config_loader import settings
from .git_provider import FilePatchInfo from .git_provider import FilePatchInfo, GitProvider
class GithubProvider: class GithubProvider(GitProvider):
def __init__(self, pr_url: Optional[str] = None): def __init__(self, pr_url: Optional[str] = None):
self.installation_id = settings.get("GITHUB.INSTALLATION_ID") self.installation_id = settings.get("GITHUB.INSTALLATION_ID")
self.github_client = self._get_github_client() self.github_client = self._get_github_client()
@ -76,6 +76,53 @@ class GithubProvider:
path = relevant_file.strip() path = relevant_file.strip()
self.pr.create_review_comment(body=body, commit_id=self.last_commit_id, path=path, position=position) self.pr.create_review_comment(body=body, commit_id=self.last_commit_id, path=path, position=position)
def publish_code_suggestion(self, body: str,
relevant_file: str,
relevant_lines_start: int,
relevant_lines_end: int):
if not relevant_lines_start or relevant_lines_start == -1:
if settings.config.verbosity_level >= 2:
logging.exception(f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}")
return False
if relevant_lines_end<relevant_lines_start:
if settings.config.verbosity_level >= 2:
logging.exception(f"Failed to publish code suggestion, "
f"relevant_lines_end is {relevant_lines_end} and "
f"relevant_lines_start is {relevant_lines_start}")
return False
try:
import github.PullRequestComment
if relevant_lines_end > relevant_lines_start:
post_parameters = {
"body": body,
"commit_id": self.last_commit_id._identity,
"path": relevant_file,
"line": relevant_lines_end,
"start_line": relevant_lines_start,
"start_side": "RIGHT",
}
else: # API is different for single line comments
post_parameters = {
"body": body,
"commit_id": self.last_commit_id._identity,
"path": relevant_file,
"line": relevant_lines_start,
"side": "RIGHT",
}
headers, data = self.pr._requester.requestJsonAndCheck(
"POST", f"{self.pr.url}/comments", input=post_parameters
)
github.PullRequestComment.PullRequestComment(
self.pr._requester, headers, data, completed=True
)
return True
except Exception as e:
if settings.config.verbosity_level >= 2:
logging.error(f"Failed to publish code suggestion, error: {e}")
return False
def remove_initial_comment(self): def remove_initial_comment(self):
try: try:
for comment in self.pr.comments_list: for comment in self.pr.comments_list:

View File

@ -106,6 +106,12 @@ class GitLabProvider(GitProvider):
self.mr.discussions.create({'body': body, self.mr.discussions.create({'body': body,
'position': pos_obj}) 'position': pos_obj})
def publish_code_suggestion(self, body: str,
relevant_file: str,
relevant_lines_start: int,
relevant_lines_end: int):
raise "not implemented yet for gitlab"
def search_line(self, relevant_file, relevant_line_in_file): def search_line(self, relevant_file, relevant_line_in_file):
RE_HUNK_HEADER = re.compile( RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")

View File

@ -8,11 +8,14 @@ verbosity_level=0 # 0,1,2
require_focused_review=true require_focused_review=true
require_tests_review=true require_tests_review=true
require_security_review=true require_security_review=true
num_code_suggestions=4 num_code_suggestions=3
inline_code_comments = true inline_code_comments = true
[pr_questions] [pr_questions]
[pr_code_suggestions]
num_code_suggestions=4
[github] [github]
# The type of deployment to create. Valid values are 'app' or 'user'. # The type of deployment to create. Valid values are 'app' or 'user'.
deployment_type = "user" deployment_type = "user"

View File

@ -0,0 +1,79 @@
[pr_code_suggestions_prompt]
system="""You are a language model called CodiumAI-PR-Code-Reviewer.
Your task is to provide provide meaningfull non-trivial code suggestions to improve the new code in a PR (the '+' lines).
- Try to give important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices.
- Suggestions should refer only to the 'new hunk' code, and focus on improving the new added code lines, with '+'.
- Provide the exact line number range (inclusive) for each issue.
- Assume there is additional code in the relevant file that is not included in the diff.
- Provide up to {{ num_code_suggestions }} code suggestions.
- Make sure not to provide suggestion repeating modifications already implemented in the new PR code (the '+' lines).
- Don't output line numbers in the 'improved code' snippets.
You must use the following JSON schema to format your answer:
```json
{
"Code suggestions": {
"type": "array",
"minItems": 1,
"maxItems": {{ num_code_suggestions }},
"uniqueItems": "true",
"items": {
"relevant file": {
"type": "string",
"description": "the relevant file full path"
},
"suggestion content": {
"type": "string",
"description": "a concrete suggestion for meaningfully improving the new PR code."
},
"existing code": {
"type": "string",
"description": "a code snippet showing authentic relevant code lines from a 'new hunk' section. It must be continuous, correctly formatted and indented, and without line numbers."
},
"relevant lines": {
"type": "string",
"description": "the relevant lines in the 'new hunk' sections, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above."
},
"improved code": {
"type": "string",
"description": "a new code snippet that can be used to replace the relevant lines in 'new hunk' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers."
}
}
}
}
```
Example input:
'
## src/file1.py
---new_hunk---
```
[new hunk code, annotated with line numbers]
```
---old_hunk---
```
[old hunk code]
```
...
'
Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields.
"""
user="""PR Info:
Title: '{{title}}'
Branch: '{{branch}}'
Description: '{{description}}'
{%- if language %}
Main language: {{language}}
{%- endif %}
The PR Diff:
```
{{diff}}
```
Response (should be a valid JSON, and nothing else):
```json
"""

View File

@ -0,0 +1,127 @@
import copy
import json
import logging
import textwrap
from jinja2 import Environment, StrictUndefined
from pr_agent.algo.ai_handler import AiHandler
from pr_agent.algo.pr_processing import get_pr_diff
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import convert_to_markdown, try_fix_json
from pr_agent.config_loader import settings
from pr_agent.git_providers import get_git_provider, GithubProvider
from pr_agent.git_providers.git_provider import get_main_pr_language
class PRCodeSuggestions:
def __init__(self, pr_url: str, cli_mode=False):
self.git_provider = get_git_provider()(pr_url)
self.main_language = get_main_pr_language(
self.git_provider.get_languages(), self.git_provider.get_files()
)
self.ai_handler = AiHandler()
self.patches_diff = None
self.prediction = None
self.cli_mode = cli_mode
self.vars = {
"title": self.git_provider.pr.title,
"branch": self.git_provider.get_pr_branch(),
"description": self.git_provider.get_pr_description(),
"language": self.main_language,
"diff": "", # empty diff for initial calculation
'num_code_suggestions': settings.pr_code_suggestions.num_code_suggestions,
}
self.token_handler = TokenHandler(self.git_provider.pr,
self.vars,
settings.pr_code_suggestions_prompt.system,
settings.pr_code_suggestions_prompt.user)
async def suggest(self):
assert type(self.git_provider) == GithubProvider, "Only Github is supported for now"
logging.info('Generating code suggestions for PR...')
if settings.config.publish_review:
self.git_provider.publish_comment("Preparing review...", is_temporary=True)
logging.info('Getting PR diff...')
# we are using extended hunk with line numbers for code suggestions
self.patches_diff = get_pr_diff(self.git_provider,
self.token_handler,
add_line_numbers_to_hunks=True,
disable_extra_lines=True)
logging.info('Getting AI prediction...')
self.prediction = await self._get_prediction()
logging.info('Preparing PR review...')
data = self._prepare_pr_code_suggestions()
if settings.config.publish_review:
logging.info('Pushing PR review...')
self.git_provider.remove_initial_comment()
logging.info('Pushing inline code comments...')
self.push_inline_code_suggestions(data)
async def _get_prediction(self):
variables = copy.deepcopy(self.vars)
variables["diff"] = self.patches_diff # update diff
environment = Environment(undefined=StrictUndefined)
system_prompt = environment.from_string(settings.pr_code_suggestions_prompt.system).render(variables)
user_prompt = environment.from_string(settings.pr_code_suggestions_prompt.user).render(variables)
if settings.config.verbosity_level >= 2:
logging.info(f"\nSystem prompt:\n{system_prompt}")
logging.info(f"\nUser prompt:\n{user_prompt}")
model = settings.config.model
response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2,
system=system_prompt, user=user_prompt)
return response
def _prepare_pr_code_suggestions(self) -> str:
review = self.prediction.strip()
data = None
try:
data = json.loads(review)
except json.decoder.JSONDecodeError:
if settings.config.verbosity_level >= 2:
logging.info(f"Could not parse json response: {review}")
data = try_fix_json(review)
return data
def push_inline_code_suggestions(self, data):
for d in data['Code suggestions']:
if settings.config.verbosity_level >= 2:
logging.info(f"suggestion: {d}")
relevant_file = d['relevant file'].strip()
relevant_lines_str = d['relevant lines'].strip()
relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position
relevant_lines_end = int(relevant_lines_str.split('-')[-1])
content = d['suggestion content']
existing_code_snippet = d['existing code']
new_code_snippet = d['improved code']
if new_code_snippet:
try: # dedent code snippet
self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files else self.git_provider.get_diff_files()
original_initial_line = None
for file in self.diff_files:
if file.filename.strip() == relevant_file:
original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1]
break
if original_initial_line:
suggested_initial_line = new_code_snippet.splitlines()[0]
original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip())
suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip())
delta_spaces = original_initial_spaces - suggested_initial_spaces
if delta_spaces > 0:
new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n')
except Exception as e:
if settings.config.verbosity_level >= 2:
logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}")
body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```"
success = self.git_provider.publish_code_suggestion(body=body,
relevant_file=relevant_file,
relevant_lines_start=relevant_lines_start,
relevant_lines_end=relevant_lines_end)

View File

@ -35,7 +35,7 @@ class PRDescription:
self.prediction = None self.prediction = None
async def describe(self): async def describe(self):
logging.info('Answering a PR question...') logging.info('Generating a PR description...')
if settings.config.publish_review: if settings.config.publish_review:
self.git_provider.publish_comment("Preparing pr description...", is_temporary=True) self.git_provider.publish_comment("Preparing pr description...", is_temporary=True)
logging.info('Getting PR diff...') logging.info('Getting PR diff...')