Merge pull request #630 from Codium-ai/tr/language

Enhancements in Patch Formatting and Code Suggestions Handling
This commit is contained in:
Tal
2024-01-29 12:11:23 -08:00
committed by GitHub
7 changed files with 37 additions and 29 deletions

View File

@ -181,7 +181,7 @@ __old hunk__
...
"""
patch_with_lines_str = f"\n\n## {file.filename}\n"
patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@ -202,11 +202,11 @@ __old hunk__
if new_content_lines:
if prev_header_line:
patch_with_lines_str += f'\n{prev_header_line}\n'
patch_with_lines_str += '__new hunk__\n'
patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '__old hunk__\n'
patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
@ -236,11 +236,11 @@ __old hunk__
if match and new_content_lines:
if new_content_lines:
patch_with_lines_str += f'\n{header_line}\n'
patch_with_lines_str += '\n__new hunk__\n'
patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '\n__old hunk__\n'
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"

View File

@ -209,9 +209,9 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
if patch:
if not convert_hunks_to_line_numbers:
patch_final = f"## {file.filename}\n\n{patch}\n"
patch_final = f"\n\n## file: '{file.filename.strip()}\n\n{patch.strip()}\n'"
else:
patch_final = patch
patch_final = "\n\n" + patch.strip()
patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final)
if get_settings().config.verbosity_level >= 2:
@ -375,6 +375,13 @@ def get_pr_multi_diffs(git_provider: GitProvider,
for lang in pr_languages:
sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
# try first a single run with standard diff string, with patch extension, and no deletions
patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
pr_languages, token_handler, add_line_numbers_to_hunks=True)
if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model):
return ["\n".join(patches_extended)]
patches = []
final_diff_list = []
total_tokens = token_handler.prompt_tokens

View File

@ -5,7 +5,7 @@ Your task is to generate {{ docs_for_language }} for code components in the PR D
Example for the PR Diff format:
======
## src/file1.py
## file: 'src/file1.py'
@@ -12,3 +12,4 @@ def func1():
__new hunk__
@ -18,7 +18,6 @@ __old hunk__
-code line that was removed in the PR
code line2 that remained unchanged in the PR
@@ ... @@ def func2():
__new hunk__
...
@ -26,7 +25,7 @@ __old hunk__
...
## src/file2.py
## file: 'src/file2.py'
...
======

View File

@ -4,7 +4,7 @@ Your task is to provide meaningful and actionable code suggestions, to improve t
Example for the PR Diff format:
======
## src/file1.py
## file: 'src/file1.py'
@@ ... @@ def func1():
__new hunk__
@ -16,7 +16,6 @@ __old hunk__
-old code line2 that was removed in the PR
code line3 that remained unchanged in the PR
@@ ... @@ def func2():
__new hunk__
...
@ -24,7 +23,7 @@ __old hunk__
...
## src/file2.py
## file: 'src/file2.py'
...
======
@ -51,6 +50,7 @@ The output must be a YAML object equivalent to type $PRCodeSuggestions, accordin
=====
class CodeSuggestion(BaseModel):
relevant_file: str = Field(description="the relevant file full path")
language: str = Field(description="the code language of the relevant file")
suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
{%- if summarize_mode %}
existing_code: str = Field(description="a short code snippet from a '__new hunk__' section to illustrate the relevant existing code. Don't show the line numbers.")
@ -74,6 +74,8 @@ Example output:
code_suggestions:
- relevant_file: |-
src/file1.py
language: |-
python
suggestion_content: |-
Add a docstring to func1()
{%- if summarize_mode %}
@ -105,11 +107,6 @@ user="""PR Info:
Title: '{{title}}'
{%- if language %}
Main PR language: '{{ language }}'
{%- endif %}
The PR Diff:
======

View File

@ -39,6 +39,7 @@ class PRType(str, Enum):
class FileDescription(BaseModel):
filename: str = Field(description="the relevant file full path")
language: str = Field(description="the relevant file language")
changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
changes_title: str = Field(description="an informative title for the changes in the files, describing its main theme (5-10 words).")
label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
@ -67,6 +68,8 @@ type:
pr_files:
- filename: |
...
language: |
...
changes_summary: |
...
changes_title: |
@ -104,10 +107,7 @@ Previous description:
{%- endif %}
Branch: '{{branch}}'
{%- if language %}
Main PR language: '{{ language }}'
{%- endif %}
{%- if commit_messages_str %}
Commit messages:

View File

@ -5,7 +5,7 @@ The review should focus on new code added in the PR diff (lines starting with '+
Example PR Diff:
======
## src/file1.py
## file: 'src/file1.py'
@@ -12,5 +12,5 @@ def func1():
code line 1 that remained unchanged in the PR
@ -14,12 +14,11 @@ code line 2 that remained unchanged in the PR
+code line added in the PR
code line 3 that remained unchanged in the PR
@@ ... @@ def func2():
...
## src/file2.py
## file: 'src/file2.py'
...
======
@ -115,6 +114,9 @@ PR Feedback:
relevant file:
type: string
description: the relevant file full path
language:
type: string
description: the language of the relevant file
suggestion:
type: string
description: |-
@ -166,6 +168,8 @@ PR Feedback:
Code feedback:
- relevant file: |-
directory/xxx.py
language: |-
python
suggestion: |-
xxx [important]
relevant line: |-
@ -195,10 +199,6 @@ Description:
======
{%- endif %}
{%- if language %}
Main PR language: '{{ language }}'
{%- endif %}
{%- if commit_messages_str %}
Commit messages:

View File

@ -226,7 +226,7 @@ class PRCodeSuggestions:
for i, patches_diff in enumerate(patches_diff_list):
get_logger().info(f"Processing chunk {i + 1} of {len(patches_diff_list)}")
self.patches_diff = patches_diff
prediction = await self._get_prediction(model)
prediction = await self._get_prediction(model) # TODO: parallelize
prediction_list.append(prediction)
self.prediction_list = prediction_list
@ -253,10 +253,15 @@ class PRCodeSuggestions:
"""
suggestion_list = []
if not data:
return suggestion_list
for suggestion in data:
suggestion_list.append(suggestion)
data_sorted = [[]] * len(suggestion_list)
if len(suggestion_list ) == 1:
return suggestion_list
try:
suggestion_str = ""
for i, suggestion in enumerate(suggestion_list):