refactor(utils): improve file walkthrough parsing with regex and better error handling

2025-07-21 04:50:39 +08:00 · 2025-07-18 08:54:52 +03:00
parent 754d47f187
commit e8c73e7baa
2 changed files with 80 additions and 58 deletions
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@ -1285,14 +1285,35 @@ def process_description(description_full: str) -> Tuple[str, List]:
    if not description_full:
        return "", []

-    description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value)
-    base_description_str = description_split[0]
-    changes_walkthrough_str = ""
-    files = []
-    if len(description_split) > 1:
-        changes_walkthrough_str = description_split[1]
+    # description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value)
+    if PRDescriptionHeader.FILE_WALKTHROUGH.value in description_full:
+        try:
+            # FILE_WALKTHROUGH are presented in a collapsible section in the description
+            regex_pattern = r'<details.*?>\s*<summary>\s*<h3>\s*' + re.escape(PRDescriptionHeader.FILE_WALKTHROUGH.value) + r'\s*</h3>\s*</summary>'
+            description_split = re.split(regex_pattern, description_full, maxsplit=1, flags=re.DOTALL)
+
+            # If the regex pattern is not found, fallback to the previous method
+            if len(description_split) == 1:
+                get_logger().debug("Could not find regex pattern for file walkthrough, falling back to simple split")
+                description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1)
+        except Exception as e:
+            get_logger().warning(f"Failed to split description using regex, falling back to simple split: {e}")
+            description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1)
+
+        if len(description_split) < 2:
+            get_logger().error("Failed to split description into base and changes walkthrough", artifact={'description': description_full})
+            return description_full.strip(), []
+
+        base_description_str = description_split[0].strip()
+        changes_walkthrough_str = ""
+        files = []
+        if len(description_split) > 1:
+            changes_walkthrough_str = description_split[1]
+        else:
+            get_logger().debug("No changes walkthrough found")
    else:
-        get_logger().debug("No changes walkthrough found")
+        base_description_str = description_full.strip()
+        return base_description_str, []

    try:
        if changes_walkthrough_str:
@ -1315,18 +1336,20 @@ def process_description(description_full: str) -> Tuple[str, List]:
                try:
                    if isinstance(file_data, tuple):
                        file_data = file_data[0]
-                    pattern = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*<li>(.*?)</details>'
+                    pattern = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*(?:<li>|•)(.*?)</details>'
                    res = re.search(pattern, file_data, re.DOTALL)
                    if not res or res.lastindex != 4:
                        pattern_back = r'<details>\s*<summary><strong>(.*?)</strong><dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\n\n\s*(.*?)</details>'
                        res = re.search(pattern_back, file_data, re.DOTALL)
                    if not res or res.lastindex != 4:
-                        pattern_back = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*-\s*(.*?)\s*</details>' # looking for hyphen ('- ')
+                        pattern_back = r'<details>\s*<summary><strong>(.*?)</strong>\s*<dd><code>(.*?)</code>.*?</summary>\s*<hr>\s*(.*?)\s*-\s*(.*?)\s*</details>' # looking for hypen ('- ')
                        res = re.search(pattern_back, file_data, re.DOTALL)
                    if res and res.lastindex == 4:
                        short_filename = res.group(1).strip()
                        short_summary = res.group(2).strip()
                        long_filename = res.group(3).strip()
+                        if long_filename.endswith('<ul>'):
+                            long_filename = long_filename[:-4].strip()
                        long_summary =  res.group(4).strip()
                        long_summary = long_summary.replace('<br> *', '\n*').replace('<br>','').replace('\n','<br>')
                        long_summary = h.handle(long_summary).strip()
@ -1345,7 +1368,7 @@ def process_description(description_full: str) -> Tuple[str, List]:
                        if '<code>...</code>' in file_data:
                            pass # PR with many files. some did not get analyzed
                        else:
-                            get_logger().error(f"Failed to parse description", artifact={'description': file_data})
+                            get_logger().warning(f"Failed to parse description", artifact={'description': file_data})
                except Exception as e:
                    get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data})

--- a/tests/unittest/test_convert_to_markdown.py
+++ b/tests/unittest/test_convert_to_markdown.py
@ -51,7 +51,7 @@ class TestConvertToMarkdown:
        input_data = {'review': {
            'estimated_effort_to_review_[1-5]': '1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n',
            'relevant_tests': 'No\n', 'possible_issues': 'No\n', 'security_concerns': 'No\n'}}
-        
+
        expected_output = textwrap.dedent(f"""\
            {PRReviewHeader.REGULAR.value} 🔍

@ -67,12 +67,12 @@ class TestConvertToMarkdown:
        """)

        assert convert_to_markdown_v2(input_data).strip() == expected_output.strip()
-    
+
    def test_simple_dictionary_input_without_gfm_supported(self):
        input_data = {'review': {
            'estimated_effort_to_review_[1-5]': '1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n',
            'relevant_tests': 'No\n', 'possible_issues': 'No\n', 'security_concerns': 'No\n'}}
-        
+
        expected_output = textwrap.dedent("""\
            ## PR Reviewer Guide 🔍

@ -89,74 +89,74 @@ class TestConvertToMarkdown:
        """)

        assert convert_to_markdown_v2(input_data, gfm_supported=False).strip() == expected_output.strip()
-    
+
    def test_key_issues_to_review(self):
        input_data = {'review': {
            'key_issues_to_review': [
                {
-                    'relevant_file' : 'src/utils.py',
-                    'issue_header' : 'Code Smell',
-                    'issue_content' : 'The function is too long and complex.',
+                    'relevant_file': 'src/utils.py',
+                    'issue_header': 'Code Smell',
+                    'issue_content': 'The function is too long and complex.',
                    'start_line': 30,
                    'end_line': 50,
                }
            ]
        }}
        mock_git_provider = Mock()
-        reference_link = 'https://github.com/qodo/pr-agent/pull/1/files#diff-hashvalue-R174' 
+        reference_link = 'https://github.com/qodo/pr-agent/pull/1/files#diff-hashvalue-R174'
        mock_git_provider.get_line_link.return_value = reference_link

        expected_output = textwrap.dedent(f"""\
            ## PR Reviewer Guide 🔍
-                                          
+
            Here are some key observations to aid the review process:
-                                          
+
            <table>
            <tr><td>⚡&nbsp;<strong>Recommended focus areas for review</strong><br><br>
-                                                                                    
+
            <a href='{reference_link}'><strong>Code Smell</strong></a><br>The function is too long and complex.
-                                          
+
            </td></tr>
            </table>
        """)
-        
+
        assert convert_to_markdown_v2(input_data, git_provider=mock_git_provider).strip() == expected_output.strip()
        mock_git_provider.get_line_link.assert_called_with('src/utils.py', 30, 50)
-    
+
    def test_ticket_compliance(self):
        input_data = {'review': {
            'ticket_compliance_check': [
                {
                    'ticket_url': 'https://example.com/ticket/123',
-                    'ticket_requirements': '- Requirement 1\n- Requirement 2\n', 
-                    'fully_compliant_requirements': '- Requirement 1\n- Requirement 2\n', 
+                    'ticket_requirements': '- Requirement 1\n- Requirement 2\n',
+                    'fully_compliant_requirements': '- Requirement 1\n- Requirement 2\n',
                    'not_compliant_requirements': '',
                    'requires_further_human_verification': '',
                }
            ]
        }}
-        
+
        expected_output = textwrap.dedent("""\
            ## PR Reviewer Guide 🔍
-            
+
            Here are some key observations to aid the review process:
-                                          
+
            <table>
            <tr><td>
-                                          
+
            **🎫 Ticket compliance analysis ✅**
-                                          
-                                          
-                                          
+
+
+
            **[123](https://example.com/ticket/123) - Fully compliant**
-                                          
+
            Compliant requirements:
-                                          
+
            - Requirement 1
            - Requirement 2
-                                          
-                                          
-                                          
+
+
+
            </td></tr>
            </table>
        """)
@ -179,43 +179,43 @@ class TestConvertToMarkdown:
                    ],
                    'title': 'Bug Fix',
                }
-             ]
-            }
+            ]
+        }
        }

        expected_output = textwrap.dedent("""\
            ## PR Reviewer Guide 🔍
-                                          
+
            Here are some key observations to aid the review process:

            <table>
            <tr><td>🔀 <strong>Multiple PR themes</strong><br><br>
-                                          
+
            <details><summary>
            Sub-PR theme: <b>Refactoring</b></summary>
-                                          
+
            ___
-                                          
+
            Relevant files:
-                                          
+
            - src/file1.py
            - src/file2.py
            ___
-                                          
+
            </details>
-                                          
+
            <details><summary>
            Sub-PR theme: <b>Bug Fix</b></summary>
-                                          
+
            ___
-                                          
+
            Relevant files:
-                                          
+
            - src/file3.py
            ___
-                                          
+
            </details>
-                                          
+
            </td></tr>
            </table>
        """)
@ -228,7 +228,6 @@ class TestConvertToMarkdown:

        expected_output = ''

-
        assert convert_to_markdown_v2(input_data).strip() == expected_output.strip()

    def test_dictionary_with_empty_dictionaries(self):
@ -236,16 +235,16 @@ class TestConvertToMarkdown:

        expected_output = ''

-
        assert convert_to_markdown_v2(input_data).strip() == expected_output.strip()

+
 class TestBR:
    def test_br1(self):
        file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.'
        file_change_description_br = insert_br_after_x_chars(file_change_description)
-        expected_output = ('<li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from '
+        expected_output = ('<ul><li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from '
                           '<code>pr_agent.algo.types</code> instead <br>of '
-                           '<code>pr_agent.git_providers.git_provider</code>.')
+                           '<code>pr_agent.git_providers.git_provider</code>.</ul>')
        assert file_change_description_br == expected_output
        # print("-----")
        # print(file_change_description_br)
@ -255,9 +254,9 @@ class TestBR:
            '- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
            'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`')
        file_change_description_br = insert_br_after_x_chars(file_change_description)
-        expected_output = ('<li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>'
+        expected_output = ('<ul><li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>'
                           'ColorPaletteResourcesCollection ColorPaletteResourcesCollection '
-                           '</code><br><code>ColorPaletteResourcesCollection</code>')
+                           '</code><br><code>ColorPaletteResourcesCollection</code></ul>')
        assert file_change_description_br == expected_output
        # print("-----")
        # print(file_change_description_br)