find_line_number_of_relevant_line_in_file

mrT23
2023-08-05 10:34:09 +03:00
parent bd86266a4b
commit fed0ea349a
7 changed files with 137 additions and 63 deletions

View File

@@ -1,8 +1,9 @@
from __future__ import annotations
+import re
+import difflib
import logging
-from typing import Callable, Tuple
+from typing import Callable, Tuple, List, Any, Sequence
from github import RateLimitExceededException
from pr_agent.algo import MAX_TOKENS
@@ -10,7 +11,7 @@ from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbe
from pr_agent.algo.language_handler import sort_files_by_main_languages
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.config_loader import get_settings
-from pr_agent.git_providers.git_provider import GitProvider
+from pr_agent.git_providers.git_provider import GitProvider, FilePatchInfo
DELETED_FILES_ = "Deleted files:\n"
@@ -217,3 +218,53 @@ async def retry_with_fallback_models(f: Callable):
logging.warning(f"Failed to generate prediction with {model}: {e}")
if i == len(all_models) - 1: # If it's the last iteration
raise # Re-raise the last exception
def find_line_number_of_relevant_line_in_file(diff_files: list[FilePatchInfo], relevant_file: str,
relevant_line_in_file: str) -> Tuple[int, int]:
position = -1
absolute_position = -1
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
for file in diff_files:
if file.filename.strip() == relevant_file:
patch = file.patch
patch_lines = patch.splitlines()
# try to find the line in the patch using difflib, with some margin of error
matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file,
file.patch.splitlines(), n=3, cutoff=0.95)
if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'):
relevant_line_in_file = matches_difflib[0]
delta = 0
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = RE_HUNK_HEADER.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file in line and line[0] != '-':
position = i
absolute_position = start2 + delta - 1
break
if position == -1:
for i, line in enumerate(patch_lines):
if line.startswith('@@'):
delta = 0
match = RE_HUNK_HEADER.match(line)
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif not line.startswith('-'):
delta += 1
if relevant_line_in_file[0] == '+' and relevant_line_in_file[1:].lstrip() in line and line[
0] != '-':
# The model often adds a '+' to the beginning of the relevant_line_in_file even if originally
# it's a context line
position = i
absolute_position = start2 + delta - 1
break
return position, absolute_position
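
For context on how the new helper behaves, here is a minimal usage sketch (not part of the commit). A SimpleNamespace stands in for FilePatchInfo, since the function only reads the .filename and .patch attributes; the toy patch, file name and relevant line are all made up for illustration.

from types import SimpleNamespace
from pr_agent.algo.pr_processing import find_line_number_of_relevant_line_in_file

# Toy single-hunk patch; the "relevant line" is the added one.
toy_patch = (
    "@@ -1,3 +1,4 @@\n"
    " def greet():\n"
    "+    name = 'world'\n"
    "     print('hello')\n"
    " # end\n"
)
diff_files = [SimpleNamespace(filename="greet.py", patch=toy_patch)]

position, absolute_position = find_line_number_of_relevant_line_in_file(
    diff_files, "greet.py", "+    name = 'world'")
# position -> 2 (0-based index of the matched line inside the patch text)
# absolute_position -> 2 (line number of the added line in the new file)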

View File

@@ -40,7 +40,7 @@ def convert_to_markdown(output_data: dict) -> str:
"Security concerns": "🔒",
"General PR suggestions": "💡",
"Insights from user's answers": "📝",
-"Code suggestions": "🤖",
+"Code feedback": "🤖",
}
for key, value in output_data.items(): for key, value in output_data.items():
@@ -50,12 +50,12 @@ def convert_to_markdown(output_data: dict) -> str:
markdown_text += f"## {key}\n\n"
markdown_text += convert_to_markdown(value)
elif isinstance(value, list):
-if key.lower() == 'code suggestions':
+if key.lower() == 'code feedback':
markdown_text += "\n" # just looks nicer with additional line breaks
emoji = emojis.get(key, "")
markdown_text += f"- {emoji} **{key}:**\n\n"
for item in value:
-if isinstance(item, dict) and key.lower() == 'code suggestions':
+if isinstance(item, dict) and key.lower() == 'code feedback':
markdown_text += parse_code_suggestion(item)
elif item:
markdown_text += f" - {item}\n"
@@ -100,7 +100,7 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
Args:
- review: A string containing the JSON message to be fixed.
- max_iter: An integer representing the maximum number of iterations to try and fix the JSON message.
-- code_suggestions: A boolean indicating whether to try and fix JSON messages with code suggestions.
+- code_suggestions: A boolean indicating whether to try and fix JSON messages with code feedback.
Returns:
- data: A dictionary containing the parsed JSON data.
@@ -108,7 +108,7 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
The function attempts to fix broken or incomplete JSON messages by parsing until the last valid code suggestion.
If the JSON message ends with a closing bracket, the function calls the fix_json_escape_char function to fix the
message.
-If code_suggestions is True and the JSON message contains code suggestions, the function tries to fix the JSON
+If code_suggestions is True and the JSON message contains code feedback, the function tries to fix the JSON
message by parsing until the last valid code suggestion.
The function uses regular expressions to find the last occurrence of "}," with any number of whitespaces or
newlines.
@@ -128,7 +128,8 @@ def try_fix_json(review, max_iter=10, code_suggestions=False):
else:
closing_bracket = "]}}"
-if review.rfind("'Code suggestions': [") > 0 or review.rfind('"Code suggestions": [') > 0:
+if (review.rfind("'Code feedback': [") > 0 or review.rfind('"Code feedback": [') > 0) or \
+(review.rfind("'Code suggestions': [") > 0 or review.rfind('"Code suggestions": [') > 0) :
last_code_suggestion_ind = [m.end() for m in re.finditer(r"\}\s*,", review)][-1] - 1
valid_json = False
iter_count = 0
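
For intuition about the repair try_fix_json performs here, a small standalone sketch (assuming the default-shaped reply, so the closing bracket is the "]}}" case shown above; the truncated payload is invented).

import json
import re

# A model reply whose "Code feedback" array was cut off mid-item.
truncated_review = (
    '{"PR Feedback": {"Code feedback": ['
    '{"relevant file": "a.py", "suggestion": "x", "relevant line": "y"},'
    '{"relevant file": "b.py", "sugg'
)
closing_bracket = "]}}"
# Cut at the last complete item ("},") and re-close the structure,
# mirroring the logic in try_fix_json above.
last_complete_item = [m.end() for m in re.finditer(r"\}\s*,", truncated_review)][-1] - 1
data = json.loads(truncated_review[:last_complete_item] + closing_bracket)
assert data["PR Feedback"]["Code feedback"][0]["relevant file"] == "a.py"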

View File

@@ -1,4 +1,6 @@
import logging
+import hashlib
from datetime import datetime
from typing import Optional, Tuple
from urllib.parse import urlparse
@@ -10,6 +12,7 @@ from starlette_context import context
from .git_provider import FilePatchInfo, GitProvider, IncrementalPR
from ..algo.language_handler import is_valid_file
from ..algo.utils import load_large_diff
+from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from ..servers.utils import RateLimitExceeded
@@ -148,22 +151,9 @@ class GithubProvider(GitProvider):
def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)])
def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
-diff_files = self.get_diff_files()
-position = -1
-for file in diff_files:
-if file.filename.strip() == relevant_file:
-patch = file.patch
-patch_lines = patch.splitlines()
-for i, line in enumerate(patch_lines):
-if relevant_line_in_file in line:
-position = i
-break
-elif relevant_line_in_file[0] == '+' and relevant_line_in_file[1:].lstrip() in line:
-# The model often adds a '+' to the beginning of the relevant_line_in_file even if originally
-# it's a context line
-position = i
-break
+position = find_line_number_of_relevant_line_in_file(self.diff_files, relevant_file.strip('`'), relevant_line_in_file)
if position == -1:
if get_settings().config.verbosity_level >= 2:
logging.info(f"Could not find position for {relevant_file} {relevant_line_in_file}")
@@ -171,8 +161,6 @@ class GithubProvider(GitProvider):
else:
subject_type = "LINE"
path = relevant_file.strip()
-# placeholder for future API support (already supported in single inline comment)
-# return dict(body=body, path=path, position=position, subject_type=subject_type)
return dict(body=body, path=path, position=position) if subject_type == "LINE" else {}
def publish_inline_comments(self, comments: list[dict]):
@@ -384,3 +372,25 @@ class GithubProvider(GitProvider):
except:
commit_messages_str = ""
return commit_messages_str
def generate_link_to_relevant_line_number(self, suggestion) -> str:
try:
relevant_file = suggestion['relevant file']
relevant_line_str = suggestion['relevant line']
position, absolute_position = find_line_number_of_relevant_line_in_file \
(self.diff_files, relevant_file.strip('`'), relevant_line_str)
if absolute_position != -1:
# # link to right file only
# link = f"https://github.com/{self.repo}/blob/{self.pr.head.sha}/{relevant_file}" \
# + "#" + f"L{absolute_position}"
# link to diff
sha_file = hashlib.sha256(relevant_file.encode('utf-8')).hexdigest()
link = f"https://github.com/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{absolute_position}"
return link
except Exception as e:
if get_settings().config.verbosity_level >= 2:
logging.info(f"Failed adding line link, error: {e}")
return ""

View File

@@ -344,4 +344,4 @@ class GitLabProvider(GitProvider):
commit_messages_str = "\n".join([f"{i + 1}. {message}" for i, message in enumerate(commit_messages_list)])
except:
commit_messages_str = ""
return commit_messages_str

View File

@@ -13,8 +13,8 @@ require_focused_review=true
require_score_review=false
require_tests_review=true
require_security_review=true
-num_code_suggestions=0
-inline_code_comments = true
+num_code_suggestions=3
+inline_code_comments = false
ask_and_reflect=false
extra_instructions = ""

View File

@@ -1,9 +1,9 @@
[pr_review_prompt]
system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests.
Your task is to provide constructive and concise feedback for the PR, and also provide meaningfull code suggestions to improve the new PR code (the '+' lines).
-- Provide up to {{ num_code_suggestions }} code suggestions.
{%- if num_code_suggestions > 0 %}
-- Try to focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices.
+- Provide up to {{ num_code_suggestions }} code suggestions.
+- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices.
- Suggestions should focus on improving the new added code lines.
- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines).
{%- endif %}
@@ -24,7 +24,7 @@ You must use the following JSON schema to format your answer:
},
"Type of PR": {
"type": "string",
-"enum": ["Bug fix", "Tests", "Bug fix with tests", "Refactoring", "Enhancement", "Documentation", "Other"]
+"enum": ["Bug fix", "Tests", "Refactoring", "Enhancement", "Documentation", "Other"]
},
{%- if require_score %}
"Score": {
@@ -47,17 +47,17 @@ You must use the following JSON schema to format your answer:
{%- if require_focused %}
"Focused PR": {
"type": "string",
-"description": "Is this a focused PR, in the sense that it has a clear and coherent title and description, and all PR code diff changes are properly derived from the title and description? Explain your response."
+"description": "Is this a focused PR, in the sense that all the PR code diff changes are united under a single focused theme ? If the theme is too broad, or the PR code diff changes are too scattered, then the PR is not focused. Explain your answer shortly."
}
},
{%- endif %}
"PR Feedback": {
-"General PR suggestions": {
+"General suggestions": {
"type": "string",
-"description": "General suggestions and feedback for the contributors and maintainers of this PR. May include important suggestions for the overall structure, primary purpose, best practices, critical bugs, and other aspects of the PR. Explain your suggestions."
+"description": "General suggestions and feedback for the contributors and maintainers of this PR. May include important suggestions for the overall structure, primary purpose, best practices, critical bugs, and other aspects of the PR. Don't address PR title and description, or lack of tests. Explain your suggestions."
},
{%- if num_code_suggestions > 0 %}
-"Code suggestions": {
+"Code feedback": {
"type": "array",
"maxItems": {{ num_code_suggestions }},
"uniqueItems": true,
@@ -66,13 +66,13 @@ You must use the following JSON schema to format your answer:
"type": "string",
"description": "the relevant file full path"
},
-"suggestion content": {
+"suggestion": {
"type": "string",
"description": "a concrete suggestion for meaningfully improving the new PR code. Also describe how, specifically, the suggestion can be applied to new PR code. Add tags with importance measure that matches each suggestion ('important' or 'medium'). Do not make suggestions for updating or adding docstrings, renaming PR title and description, or linter like."
},
-"relevant line in file": {
+"relevant line": {
"type": "string",
-"description": "an authentic single code line from the PR git diff section, to which the suggestion applies."
+"description": "a single code line taken from the relevant file, to which the suggestion applies. The line should be a '+' line. Make sure to output the line exactly as it appears in the relevant file"
}
}
},
@@ -80,8 +80,8 @@ You must use the following JSON schema to format your answer:
{%- if require_security %}
"Security concerns": {
"type": "string",
"description": "yes\\no question: does this PR code introduce possible security concerns or issues, like SQL injection, XSS, CSRF, and others ? explain your answer" "description": "yes\\no question: does this PR code introduce possible security concerns or issues, like SQL injection, XSS, CSRF, and others ? If answered 'yes', explain your answer shortly"
? explain your answer" ? explain your answer shortly"
}
{%- endif %}
}
@@ -109,11 +109,11 @@ Example output:
{
"General PR suggestions": "..., `xxx`...",
{%- if num_code_suggestions > 0 %}
-"Code suggestions": [
+"Code feedback": [
{
"relevant file": "directory/xxx.py",
-"suggestion content": "xxx [important]",
-"relevant line in file": "xxx",
+"suggestion": "xxx [important]",
+"relevant line": "xxx",
},
...
]

View File

@@ -7,7 +7,8 @@ from typing import List, Tuple
from jinja2 import Environment, StrictUndefined
from pr_agent.algo.ai_handler import AiHandler
-from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models
+from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, \
+    find_line_number_of_relevant_line_in_file
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import convert_to_markdown, try_fix_json
from pr_agent.config_loader import get_settings
@@ -160,27 +161,38 @@ class PRReviewer:
the feedback.
"""
review = self.prediction.strip()
try:
data = json.loads(review)
except json.decoder.JSONDecodeError:
data = try_fix_json(review)
# Move 'Security concerns' key to 'PR Analysis' section for better display
-if 'PR Feedback' in data and 'Security concerns' in data['PR Feedback']:
-val = data['PR Feedback']['Security concerns']
-del data['PR Feedback']['Security concerns']
-data['PR Analysis']['Security concerns'] = val
+pr_feedback = data.get('PR Feedback', {})
+security_concerns = pr_feedback.get('Security concerns')
+if security_concerns:
+del pr_feedback['Security concerns']
+data.setdefault('PR Analysis', {})['Security concerns'] = security_concerns
-# Filter out code suggestions that can be submitted as inline comments
-if get_settings().config.git_provider != 'bitbucket' and get_settings().pr_reviewer.inline_code_comments \
-and 'Code suggestions' in data['PR Feedback']:
-data['PR Feedback']['Code suggestions'] = [
-d for d in data['PR Feedback']['Code suggestions']
-if any(key not in d for key in ('relevant file', 'relevant line in file', 'suggestion content'))
-]
-if not data['PR Feedback']['Code suggestions']:
-del data['PR Feedback']['Code suggestions']
+if 'Code feedback' in pr_feedback:
+code_feedback = pr_feedback['Code feedback']
+# Filter out code suggestions that can be submitted as inline comments
+if get_settings().pr_reviewer.inline_code_comments:
+del pr_feedback['Code feedback']
+else:
+for suggestion in code_feedback:
+relevant_line_str = suggestion['relevant line'].split('\n')[0]
+# removing '+'
+suggestion['relevant line'] = relevant_line_str.lstrip('+').strip()
+# try to add line numbers link to code suggestions
+if hasattr(self.git_provider, 'generate_link_to_relevant_line_number'):
+link = self.git_provider.generate_link_to_relevant_line_number(suggestion)
+if link:
+suggestion['relevant line'] = f"[{suggestion['relevant line']}]({link})"
# Add incremental review section
if self.incremental.is_incremental:
@@ -205,7 +217,7 @@ class PRReviewer:
# Log markdown response if verbosity level is high
if get_settings().config.verbosity_level >= 2:
logging.info(f"Markdown response:\n{markdown_text}")
return markdown_text
def _publish_inline_code_comments(self) -> None:
@@ -222,10 +234,10 @@
data = try_fix_json(review)
comments: List[str] = []
-for suggestion in data.get('PR Feedback', {}).get('Code suggestions', []):
+for suggestion in data.get('PR Feedback', {}).get('Code feedback', []):
relevant_file = suggestion.get('relevant file', '').strip()
-relevant_line_in_file = suggestion.get('relevant line in file', '').strip()
-content = suggestion.get('suggestion content', '')
+relevant_line_in_file = suggestion.get('relevant line', '').strip()
+content = suggestion.get('suggestion', '')
if not relevant_file or not relevant_line_in_file or not content:
logging.info("Skipping inline comment with missing file/line/content")
continue
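
To make the renamed fields concrete, a hedged sketch of one parsed "Code feedback" item and the inline-comment payload it maps to; the file, line, position and suggestion text are placeholders, and the payload shape follows the create_inline_comment return value shown earlier.

# Hypothetical parsed item (placeholders) using the new key names.
suggestion = {
    "relevant file": "pr_agent/algo/utils.py",
    "relevant line": "markdown_text += parse_code_suggestion(item)",
    "suggestion": "Handle non-dict items defensively [medium]",
}

relevant_file = suggestion.get('relevant file', '').strip()
relevant_line_in_file = suggestion.get('relevant line', '').strip()
content = suggestion.get('suggestion', '')

# create_inline_comment() (GitHub provider above) resolves the diff position
# for this file/line and returns roughly this payload for publish_inline_comments():
comment = {"body": content, "path": relevant_file, "position": 7}  # position value is illustrative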