refactor(utils): improve file walkthrough parsing with regex and better error handling

This commit is contained in:
mrT23
2025-07-18 08:54:52 +03:00
parent 754d47f187
commit e8c73e7baa
2 changed files with 80 additions and 58 deletions

View File

@ -1285,14 +1285,35 @@ def process_description(description_full: str) -> Tuple[str, List]:
if not description_full: if not description_full:
return "", [] return "", []
description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value) # description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value)
base_description_str = description_split[0] if PRDescriptionHeader.FILE_WALKTHROUGH.value in description_full:
changes_walkthrough_str = "" try:
files = [] # FILE_WALKTHROUGH are presented in a collapsible section in the description
if len(description_split) > 1: regex_pattern = r'<details.*?>\s*<summary>\s*<h3>\s*' + re.escape(PRDescriptionHeader.FILE_WALKTHROUGH.value) + r'\s*</h3>\s*</summary>'
changes_walkthrough_str = description_split[1] description_split = re.split(regex_pattern, description_full, maxsplit=1, flags=re.DOTALL)
# If the regex pattern is not found, fallback to the previous method
if len(description_split) == 1:
get_logger().debug("Could not find regex pattern for file walkthrough, falling back to simple split")
description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1)
except Exception as e:
get_logger().warning(f"Failed to split description using regex, falling back to simple split: {e}")
description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1)
if len(description_split) < 2:
get_logger().error("Failed to split description into base and changes walkthrough", artifact={'description': description_full})
return description_full.strip(), []
base_description_str = description_split[0].strip()
changes_walkthrough_str = ""
files = []
if len(description_split) > 1:
changes_walkthrough_str = description_split[1]
else:
get_logger().debug("No changes walkthrough found")
else: else:
get_logger().debug("No changes walkthrough found") base_description_str = description_full.strip()
return base_description_str, []
try: try:
if changes_walkthrough_str: if changes_walkthrough_str:
@ -1315,18 +1336,20 @@ def process_description(description_full: str) -> Tuple[str, List]:
try: try:
if isinstance(file_data, tuple): if isinstance(file_data, tuple):
file_data = file_data[0] file_data = file_data[0]
pattern = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*<li>(.*?)</details>' pattern = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*(?:<li>|•)(.*?)</details>'
res = re.search(pattern, file_data, re.DOTALL) res = re.search(pattern, file_data, re.DOTALL)
if not res or res.lastindex != 4: if not res or res.lastindex != 4:
pattern_back = r'<details>\s*<summary><strong>(.*?)</strong><dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\n\n\s*(.*?)</details>' pattern_back = r'<details>\s*<summary><strong>(.*?)</strong><dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\n\n\s*(.*?)</details>'
res = re.search(pattern_back, file_data, re.DOTALL) res = re.search(pattern_back, file_data, re.DOTALL)
if not res or res.lastindex != 4: if not res or res.lastindex != 4:
pattern_back = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*-\s*(.*?)\s*</details>' # looking for hyphen ('- ') pattern_back = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*-\s*(.*?)\s*</details>' # looking for hypen ('- ')
res = re.search(pattern_back, file_data, re.DOTALL) res = re.search(pattern_back, file_data, re.DOTALL)
if res and res.lastindex == 4: if res and res.lastindex == 4:
short_filename = res.group(1).strip() short_filename = res.group(1).strip()
short_summary = res.group(2).strip() short_summary = res.group(2).strip()
long_filename = res.group(3).strip() long_filename = res.group(3).strip()
if long_filename.endswith('<ul>'):
long_filename = long_filename[:-4].strip()
long_summary = res.group(4).strip() long_summary = res.group(4).strip()
long_summary = long_summary.replace('<br> *', '\n*').replace('<br>','').replace('\n','<br>') long_summary = long_summary.replace('<br> *', '\n*').replace('<br>','').replace('\n','<br>')
long_summary = h.handle(long_summary).strip() long_summary = h.handle(long_summary).strip()
@ -1345,7 +1368,7 @@ def process_description(description_full: str) -> Tuple[str, List]:
if '<code>...</code>' in file_data: if '<code>...</code>' in file_data:
pass # PR with many files. some did not get analyzed pass # PR with many files. some did not get analyzed
else: else:
get_logger().error(f"Failed to parse description", artifact={'description': file_data}) get_logger().warning(f"Failed to parse description", artifact={'description': file_data})
except Exception as e: except Exception as e:
get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data}) get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data})

View File

@ -94,9 +94,9 @@ class TestConvertToMarkdown:
input_data = {'review': { input_data = {'review': {
'key_issues_to_review': [ 'key_issues_to_review': [
{ {
'relevant_file' : 'src/utils.py', 'relevant_file': 'src/utils.py',
'issue_header' : 'Code Smell', 'issue_header': 'Code Smell',
'issue_content' : 'The function is too long and complex.', 'issue_content': 'The function is too long and complex.',
'start_line': 30, 'start_line': 30,
'end_line': 50, 'end_line': 50,
} }
@ -179,8 +179,8 @@ class TestConvertToMarkdown:
], ],
'title': 'Bug Fix', 'title': 'Bug Fix',
} }
] ]
} }
} }
expected_output = textwrap.dedent("""\ expected_output = textwrap.dedent("""\
@ -228,7 +228,6 @@ class TestConvertToMarkdown:
expected_output = '' expected_output = ''
assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() assert convert_to_markdown_v2(input_data).strip() == expected_output.strip()
def test_dictionary_with_empty_dictionaries(self): def test_dictionary_with_empty_dictionaries(self):
@ -236,16 +235,16 @@ class TestConvertToMarkdown:
expected_output = '' expected_output = ''
assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() assert convert_to_markdown_v2(input_data).strip() == expected_output.strip()
class TestBR: class TestBR:
def test_br1(self): def test_br1(self):
file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.' file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.'
file_change_description_br = insert_br_after_x_chars(file_change_description) file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from ' expected_output = ('<ul><li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from '
'<code>pr_agent.algo.types</code> instead <br>of ' '<code>pr_agent.algo.types</code> instead <br>of '
'<code>pr_agent.git_providers.git_provider</code>.') '<code>pr_agent.git_providers.git_provider</code>.</ul>')
assert file_change_description_br == expected_output assert file_change_description_br == expected_output
# print("-----") # print("-----")
# print(file_change_description_br) # print(file_change_description_br)
@ -255,9 +254,9 @@ class TestBR:
'- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' '- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`') 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`')
file_change_description_br = insert_br_after_x_chars(file_change_description) file_change_description_br = insert_br_after_x_chars(file_change_description)
expected_output = ('<li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>' expected_output = ('<ul><li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>'
'ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
'</code><br><code>ColorPaletteResourcesCollection</code>') '</code><br><code>ColorPaletteResourcesCollection</code></ul>')
assert file_change_description_br == expected_output assert file_change_description_br == expected_output
# print("-----") # print("-----")
# print(file_change_description_br) # print(file_change_description_br)