Mirror of https://github.com/qodo-ai/pr-agent.git (synced 2025-07-04 12:50:38 +08:00)

Merge pull request #173 from Codium-ai/tr/caching

Optimization of PR Diff Processing
@@ -1,3 +1,10 @@
+## 2023-08-03
+
+### Optimized
+- Optimized PR diff processing by introducing caching for diff files, reducing the number of API calls.
+- Refactored `load_large_diff` function to generate a patch only when necessary.
+- Fixed a bug in the GitLab provider where the new file was not retrieved correctly.
+
 ## 2023-08-02
 
 ### Enhanced
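The caching called out in the new changelog entry above amounts to memoizing the provider's diff listing, so repeated steps of a single run do not re-query the git host. A minimal illustrative sketch of the pattern, not the project's actual classes (the `CachedDiffProvider` name and the `fetch_diffs` callable are invented for the example):

```python
from typing import Callable, List, Optional


class CachedDiffProvider:
    """Illustrative only: cache the expensive diff listing after the first call."""

    def __init__(self, fetch_diffs: Callable[[], List[str]]):
        self._fetch_diffs = fetch_diffs                # e.g. a REST call to the git host
        self._diff_files: Optional[List[str]] = None   # not fetched yet

    def get_diff_files(self) -> List[str]:
        if self._diff_files is None:                   # first call: hit the API once
            self._diff_files = self._fetch_diffs()
        return self._diff_files                        # later calls: served from memory


calls = 0

def fake_fetch() -> List[str]:
    global calls
    calls += 1
    return ["a.py", "b.py"]

provider = CachedDiffProvider(fake_fetch)
provider.get_diff_files()
provider.get_diff_files()
assert calls == 1  # the "API" was only hit once
```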
@@ -9,7 +9,6 @@ from pr_agent.algo import MAX_TOKENS
 from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions
 from pr_agent.algo.language_handler import sort_files_by_main_languages
 from pr_agent.algo.token_handler import TokenHandler
-from pr_agent.algo.utils import load_large_diff
 from pr_agent.config_loader import get_settings
 from pr_agent.git_providers.git_provider import GitProvider
 
@@ -46,7 +45,7 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s
         PATCH_EXTRA_LINES = 0
 
     try:
-        diff_files = list(git_provider.get_diff_files())
+        diff_files = git_provider.get_diff_files()
     except RateLimitExceededException as e:
         logging.error(f"Rate limit exceeded for git provider API. original message {e}")
         raise
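A plausible reading of the one-line change above: `get_diff_files()` in the providers below now builds, caches, and returns a concrete list itself, so the caller no longer needs the defensive `list(...)` copy of a lazy iterable.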
@@ -98,12 +97,7 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
     for lang in pr_languages:
         for file in lang['files']:
             original_file_content_str = file.base_file
-            new_file_content_str = file.head_file
             patch = file.patch
-
-            # handle the case of large patch, that initially was not loaded
-            patch = load_large_diff(file, new_file_content_str, original_file_content_str, patch)
-
             if not patch:
                 continue
 
@@ -161,7 +155,6 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
             original_file_content_str = file.base_file
             new_file_content_str = file.head_file
             patch = file.patch
-            patch = load_large_diff(file, new_file_content_str, original_file_content_str, patch)
             if not patch:
                 continue
 
@@ -195,35 +195,27 @@ def convert_str_to_datetime(date_str):
     return datetime.strptime(date_str, datetime_format)
 
 
-def load_large_diff(file, new_file_content_str: str, original_file_content_str: str, patch: str) -> str:
+def load_large_diff(filename, new_file_content_str: str, original_file_content_str: str) -> str:
     """
     Generate a patch for a modified file by comparing the original content of the file with the new content provided as
     input.
 
     Args:
-        file: The file object for which the patch needs to be generated.
         new_file_content_str: The new content of the file as a string.
         original_file_content_str: The original content of the file as a string.
-        patch: An optional patch string that can be provided as input.
 
     Returns:
         The generated or provided patch string.
 
     Raises:
         None.
-
-    Additional Information:
-        - If 'patch' is not provided as input, the function generates a patch using the 'difflib' library and returns it
-          as output.
-        - If the 'settings.config.verbosity_level' is greater than or equal to 2, a warning message is logged indicating
-          that the file was modified but no patch was found, and a patch is manually created.
     """
-    if not patch:  # to Do - also add condition for file extension
+    patch = ""
     try:
         diff = difflib.unified_diff(original_file_content_str.splitlines(keepends=True),
                                     new_file_content_str.splitlines(keepends=True))
         if get_settings().config.verbosity_level >= 2:
-            logging.warning(f"File was modified, but no patch was found. Manually creating patch: {file.filename}.")
+            logging.warning(f"File was modified, but no patch was found. Manually creating patch: {filename}.")
         patch = ''.join(diff)
     except Exception:
         pass
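The refactored `load_large_diff` above regenerates a patch from the two file versions with the standard-library `difflib` module. A standalone sketch of that core step (the file contents here are made up for the example):

```python
import difflib

original = "def add(a, b):\n    return a + b\n"
modified = "def add(a, b):\n    \"\"\"Add two numbers.\"\"\"\n    return a + b\n"

# unified_diff works on lists of lines; keepends=True preserves the newlines,
# matching how load_large_diff splits the file contents before diffing.
diff_lines = difflib.unified_diff(original.splitlines(keepends=True),
                                  modified.splitlines(keepends=True))
patch = ''.join(diff_lines)
print(patch)  # a unified-diff patch reconstructed purely from the two versions
```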
@@ -27,6 +27,7 @@ class GithubProvider(GitProvider):
         self.pr = None
         self.github_user_id = None
         self.diff_files = None
+        self.git_files = None
         self.incremental = incremental
         if pr_url:
             self.set_pr(pr_url)
@@ -81,40 +82,56 @@ class GithubProvider(GitProvider):
     def get_files(self):
         if self.incremental.is_incremental and self.file_set:
             return self.file_set.values()
-        return self.pr.get_files()
+        if not self.git_files:
+            # bring files from GitHub only once
+            self.git_files = self.pr.get_files()
+        return self.git_files
 
     @retry(exceptions=RateLimitExceeded,
            tries=get_settings().github.ratelimit_retries, delay=2, backoff=2, jitter=(1, 3))
     def get_diff_files(self) -> list[FilePatchInfo]:
+        """
+        Retrieves the list of files that have been modified, added, deleted, or renamed in a pull request in GitHub,
+        along with their content and patch information.
+
+        Returns:
+            diff_files (List[FilePatchInfo]): List of FilePatchInfo objects representing the modified, added, deleted,
+            or renamed files in the merge request.
+        """
         try:
+            if self.diff_files:
+                return self.diff_files
+
             files = self.get_files()
             diff_files = []
+
             for file in files:
-                if is_valid_file(file.filename):
-                    new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha)
+                if not is_valid_file(file.filename):
+                    continue
+
+                new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha)  # communication with GitHub
                 patch = file.patch
+
                 if self.incremental.is_incremental and self.file_set:
-                    original_file_content_str = self._get_pr_file_content(file,
-                                                                          self.incremental.last_seen_commit_sha)
-                    patch = load_large_diff(file,
-                                            new_file_content_str,
-                                            original_file_content_str,
-                                            None)
+                    original_file_content_str = self._get_pr_file_content(file, self.incremental.last_seen_commit_sha)
+                    patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
                     self.file_set[file.filename] = patch
                 else:
                     original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha)
-                diff_files.append(
-                    FilePatchInfo(original_file_content_str, new_file_content_str, patch, file.filename))
+                    if not patch:
+                        patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
+
+                diff_files.append(FilePatchInfo(original_file_content_str, new_file_content_str, patch, file.filename))
+
             self.diff_files = diff_files
             return diff_files
 
         except GithubException.RateLimitExceededException as e:
             logging.error(f"Rate limit exceeded for GitHub API. Original message: {e}")
             raise RateLimitExceeded("Rate limit exceeded for GitHub API.") from e
 
     def publish_description(self, pr_title: str, pr_body: str):
         self.pr.edit(title=pr_title, body=pr_body)
-        # self.pr.create_issue_comment(pr_comment)
 
     def publish_comment(self, pr_comment: str, is_temporary: bool = False):
         if is_temporary and not get_settings().config.publish_output_progress:
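A note on the design in the hunk above: the caches live on the provider instance (`self.git_files` for the raw file listing, `self.diff_files` for the processed patches), so every later call during the same command reuses the first fetch, while a fresh provider instance still sees current data. The early `return self.diff_files` also lets call sites simply invoke `get_diff_files()` instead of checking the attribute themselves, which is exactly what the later hunks switch to.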
@@ -132,9 +149,9 @@ class GithubProvider(GitProvider):
         self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)])
 
     def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
-        self.diff_files = self.diff_files if self.diff_files else self.get_diff_files()
+        diff_files = self.get_diff_files()
         position = -1
-        for file in self.diff_files:
+        for file in diff_files:
             if file.filename.strip() == relevant_file:
                 patch = file.patch
                 patch_lines = patch.splitlines()
@@ -7,6 +7,7 @@ import gitlab
 from gitlab import GitlabGetError
 
 from ..algo.language_handler import is_valid_file
+from ..algo.utils import load_large_diff
 from ..config_loader import get_settings
 from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
 
@@ -30,6 +31,7 @@ class GitLabProvider(GitProvider):
         self.id_mr = None
         self.mr = None
         self.diff_files = None
+        self.git_files = None
         self.temp_comments = []
         self._set_merge_request(merge_request_url)
         self.RE_HUNK_HEADER = re.compile(
@@ -65,19 +67,27 @@ class GitLabProvider(GitProvider):
         return ''
 
     def get_diff_files(self) -> list[FilePatchInfo]:
+        """
+        Retrieves the list of files that have been modified, added, deleted, or renamed in a pull request in GitLab,
+        along with their content and patch information.
+
+        Returns:
+            diff_files (List[FilePatchInfo]): List of FilePatchInfo objects representing the modified, added, deleted,
+            or renamed files in the merge request.
+        """
+
+        if self.diff_files:
+            return self.diff_files
+
         diffs = self.mr.changes()['changes']
         diff_files = []
         for diff in diffs:
             if is_valid_file(diff['new_path']):
-                original_file_content_str = self._get_pr_file_content(diff['old_path'], self.mr.target_branch)
-                new_file_content_str = self._get_pr_file_content(diff['new_path'], self.mr.source_branch)
-                edit_type = EDIT_TYPE.MODIFIED
-                if diff['new_file']:
-                    edit_type = EDIT_TYPE.ADDED
-                elif diff['deleted_file']:
-                    edit_type = EDIT_TYPE.DELETED
-                elif diff['renamed_file']:
-                    edit_type = EDIT_TYPE.RENAMED
+                # original_file_content_str = self._get_pr_file_content(diff['old_path'], self.mr.target_branch)
+                # new_file_content_str = self._get_pr_file_content(diff['new_path'], self.mr.source_branch)
+                original_file_content_str = self._get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha'])
+                new_file_content_str = self._get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha'])
+
                 try:
                     if isinstance(original_file_content_str, bytes):
                         original_file_content_str = bytes.decode(original_file_content_str, 'utf-8')
@@ -86,15 +96,33 @@ class GitLabProvider(GitProvider):
                 except UnicodeDecodeError:
                     logging.warning(
                         f"Cannot decode file {diff['old_path']} or {diff['new_path']} in merge request {self.id_mr}")
+
+                edit_type = EDIT_TYPE.MODIFIED
+                if diff['new_file']:
+                    edit_type = EDIT_TYPE.ADDED
+                elif diff['deleted_file']:
+                    edit_type = EDIT_TYPE.DELETED
+                elif diff['renamed_file']:
+                    edit_type = EDIT_TYPE.RENAMED
+
+                filename = diff['new_path']
+                patch = diff['diff']
+                if not patch:
+                    patch = load_large_diff(filename, new_file_content_str, original_file_content_str)
+
                 diff_files.append(
-                    FilePatchInfo(original_file_content_str, new_file_content_str, diff['diff'], diff['new_path'],
+                    FilePatchInfo(original_file_content_str, new_file_content_str,
+                                  patch=patch,
+                                  filename=filename,
                                   edit_type=edit_type,
                                   old_filename=None if diff['old_path'] == diff['new_path'] else diff['old_path']))
         self.diff_files = diff_files
         return diff_files
 
     def get_files(self):
-        return [change['new_path'] for change in self.mr.changes()['changes']]
+        if not self.git_files:
+            self.git_files = [change['new_path'] for change in self.mr.changes()['changes']]
+        return self.git_files
 
     def publish_description(self, pr_title: str, pr_body: str):
         try:
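The edit-type handling that moves in the GitLab hunk above is a direct mapping from the boolean flags GitLab attaches to each entry of `mr.changes()['changes']`. A self-contained sketch of that mapping (the `EditType` enum here is local to the example, standing in for the project's `EDIT_TYPE`):

```python
from enum import Enum, auto


class EditType(Enum):
    ADDED = auto()
    DELETED = auto()
    RENAMED = auto()
    MODIFIED = auto()


def classify_change(change: dict) -> EditType:
    """Map the boolean flags of a GitLab 'changes' entry to an edit type."""
    if change['new_file']:
        return EditType.ADDED
    if change['deleted_file']:
        return EditType.DELETED
    if change['renamed_file']:
        return EditType.RENAMED
    return EditType.MODIFIED


# A change dict shaped like one entry of mr.changes()['changes']:
change = {'new_file': False, 'deleted_file': False, 'renamed_file': True,
          'old_path': 'old_name.py', 'new_path': 'new_name.py', 'diff': ''}
assert classify_change(change) is EditType.RENAMED
```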
@@ -110,7 +138,6 @@ class GitLabProvider(GitProvider):
             self.temp_comments.append(comment)
 
     def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
-        self.diff_files = self.diff_files if self.diff_files else self.get_diff_files()
         edit_type, found, source_line_no, target_file, target_line_no = self.search_line(relevant_file,
                                                                                           relevant_line_in_file)
         self.send_inline_comment(body, edit_type, found, relevant_file, relevant_line_in_file, source_line_no,
@@ -151,9 +178,9 @@ class GitLabProvider(GitProvider):
             relevant_lines_start = suggestion['relevant_lines_start']
             relevant_lines_end = suggestion['relevant_lines_end']
 
-            self.diff_files = self.diff_files if self.diff_files else self.get_diff_files()
+            diff_files = self.get_diff_files()
             target_file = None
-            for file in self.diff_files:
+            for file in diff_files:
                 if file.filename == relevant_file:
                     if file.filename == relevant_file:
                         target_file = file
@@ -180,7 +207,7 @@ class GitLabProvider(GitProvider):
         target_file = None
 
         edit_type = self.get_edit_type(relevant_line_in_file)
-        for file in self.diff_files:
+        for file in self.get_diff_files():
             if file.filename == relevant_file:
                 edit_type, found, source_line_no, target_file, target_line_no = self.find_in_file(file,
                                                                                                    relevant_line_in_file)
@@ -58,12 +58,12 @@ class PRCodeSuggestions:
 
     async def _prepare_prediction(self, model: str):
         logging.info('Getting PR diff...')
-        # we are using extended hunk with line numbers for code suggestions
+
         self.patches_diff = get_pr_diff(self.git_provider,
                                         self.token_handler,
                                         model,
                                         add_line_numbers_to_hunks=True,
                                         disable_extra_lines=True)
 
         logging.info('Getting AI prediction...')
         self.prediction = await self._get_prediction(model)