full code suggestions

full code suggestions
This commit is contained in:
mrT23
2023-07-15 09:30:50 +03:00
parent 23a249ccdb
commit 4f4989af8c
11 changed files with 382 additions and 13 deletions

View File

@ -108,3 +108,78 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
logging.info(f"Processing file: {file_name}, hunks were deleted")
patch = patch_new
return patch
def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
# toDO: (maybe remove '-' and '+' from the beginning of the line)
"""
## src/file.ts
--new hunk--
881 line1
882 line2
883 line3
884 line4
885 line6
886 line7
887 + line8
888 + line9
889 line10
890 line11
...
--old hunk--
line1
line2
- line3
- line4
line5
line6
...
"""
patch_with_lines_str = f"## {file.filename}\n"
import re
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
new_content_lines = []
old_content_lines = []
match = None
start1, size1, start2, size2 = -1, -1, -1, -1
for line in patch_lines:
if 'no newline at end of file' in line.lower():
continue
if line.startswith('@@'):
match = RE_HUNK_HEADER.match(line)
if match and new_content_lines: # found a new hunk, split the previous lines
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
old_content_lines = []
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif line.startswith('+'):
new_content_lines.append(line)
elif line.startswith('-'):
old_content_lines.append(line)
else:
new_content_lines.append(line)
old_content_lines.append(line)
# finishing last hunk
if match and new_content_lines:
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '\n--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
return patch_with_lines_str.strip()

View File

@ -4,7 +4,8 @@ import difflib
import logging
from typing import Any, Tuple, Union
from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions
from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions, \
convert_to_hunks_with_lines_numbers
from pr_agent.algo.language_handler import sort_files_by_main_languages
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.config_loader import settings
@ -19,26 +20,33 @@ OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD = 600
PATCH_EXTRA_LINES = 3
def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str:
def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool =False) -> str:
"""
Returns a string with the diff of the PR.
If needed, apply diff minimization techniques to reduce the number of tokens
"""
if disable_extra_lines:
global PATCH_EXTRA_LINES
PATCH_EXTRA_LINES = 0
git_provider.pr.diff_files = list(git_provider.get_diff_files())
# get pr languages
pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files)
# generate a standard diff string, with patch extension
patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler)
patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler,
add_line_numbers_to_hunks)
# if we are under the limit, return the full diff
if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit:
return "\n".join(patches_extended)
# if we are over the limit, start pruning
patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages,
token_handler)
patches_compressed, modified_file_names, deleted_file_names = \
pr_generate_compressed_diff(pr_languages, token_handler, add_line_numbers_to_hunks)
final_diff = "\n".join(patches_compressed)
if modified_file_names:
modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
@ -49,7 +57,8 @@ def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHa
return final_diff
def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \
def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
add_line_numbers_to_hunks: bool) -> \
Tuple[list, int]:
"""
Generate a standard diff string, with patch extension
@ -72,6 +81,9 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n"
if add_line_numbers_to_hunks:
full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)
patch_tokens = token_handler.count_tokens(full_extended_patch)
file.tokens = patch_tokens
total_tokens += patch_tokens
@ -80,7 +92,8 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
return patches_extended, total_tokens
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]:
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler,
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]:
# Apply Diff Minimization techniques to reduce the number of tokens:
# 0. Start from the largest diff patch to smaller ones
# 1. Don't use extend context lines around diff
@ -114,6 +127,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
deleted_files_list.append(file.filename)
total_tokens += token_handler.count_tokens(file.filename) + 1
continue
if convert_hunks_to_line_numbers:
patch = convert_to_hunks_with_lines_numbers(patch, file)
new_patch_tokens = token_handler.count_tokens(patch)
# Hard Stop, no more tokens
@ -135,7 +152,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
continue
if patch:
patch_final = f"## {file.filename}\n\n{patch}\n"
if not convert_hunks_to_line_numbers:
patch_final = f"## {file.filename}\n\n{patch}\n"
else:
patch_final = patch
patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final)
if settings.config.verbosity_level >= 2: