From 212c72eb7dab197c48fac75e1eb2d385dc27326d Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Wed, 7 May 2025 16:02:54 +0200 Subject: [PATCH 01/15] Changelog prompt: fix markdown link --- pr_agent/settings/pr_update_changelog_prompts.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/settings/pr_update_changelog_prompts.toml b/pr_agent/settings/pr_update_changelog_prompts.toml index 6825da66..ecf610b4 100644 --- a/pr_agent/settings/pr_update_changelog_prompts.toml +++ b/pr_agent/settings/pr_update_changelog_prompts.toml @@ -6,7 +6,7 @@ Your task is to add a brief summary of this PR's changes to CHANGELOG.md file of - Be general, and avoid specific details, files, etc. The output should be minimal, no more than 3-4 short lines. - Write only the new content to be added to CHANGELOG.md, without any introduction or summary. The content should appear as if it's a natural part of the existing file. {%- if pr_link %} -- If relevant, convert the changelog main header into a clickable link using the PR URL '{{ pr_link }}'. Format: header [*][pr_link] +- If relevant, convert the changelog main header into a clickable link using the PR URL '{{ pr_link }}'. 
Format: header [*](pr_link) {%- endif %} From e516d66c1c4a6eb93c31d149b7a9004a5d708de8 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Fri, 9 May 2025 11:58:24 +0200 Subject: [PATCH 02/15] Azure: return Comment object when creating comment --- .../git_providers/azuredevops_provider.py | 33 ++++++++----------- pr_agent/git_providers/git_provider.py | 9 +++-- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index 80bf68c5..7524896c 100644 --- a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -18,14 +18,10 @@ ADO_APP_CLIENT_DEFAULT_ID = "499b84ac-1321-427f-aa17-267ca6975798/.default" MAX_PR_DESCRIPTION_AZURE_LENGTH = 4000-1 try: - # noinspection PyUnresolvedReferences # noinspection PyUnresolvedReferences from azure.devops.connection import Connection # noinspection PyUnresolvedReferences - from azure.devops.v7_1.git.models import (Comment, CommentThread, - GitPullRequest, - GitPullRequestIterationChanges, - GitVersionDescriptor) + from azure.devops.released.git import (Comment, CommentThread, GitPullRequest, GitVersionDescriptor, GitClient) # noinspection PyUnresolvedReferences from azure.identity import DefaultAzureCredential from msrest.authentication import BasicAuthentication @@ -121,31 +117,29 @@ class AzureDevopsProvider(GitProvider): get_logger().warning(f"Azure failed to publish code suggestion, error: {e}") return True - - def get_pr_description_full(self) -> str: return self.pr.description - def edit_comment(self, comment, body: str): + def edit_comment(self, comment: Comment, body: str): try: self.azure_devops_client.update_comment( repository_id=self.repo_slug, pull_request_id=self.pr_num, - thread_id=comment["thread_id"], - comment_id=comment["comment_id"], + thread_id=comment.thread_id, + comment_id=comment.id, 
comment=Comment(content=body), project=self.workspace_slug, ) except Exception as e: get_logger().exception(f"Failed to edit comment, error: {e}") - def remove_comment(self, comment): + def remove_comment(self, comment: Comment): try: self.azure_devops_client.delete_comment( repository_id=self.repo_slug, pull_request_id=self.pr_num, - thread_id=comment["thread_id"], - comment_id=comment["comment_id"], + thread_id=comment.thread_id, + comment_id=comment.id, project=self.workspace_slug, ) except Exception as e: @@ -378,7 +372,7 @@ class AzureDevopsProvider(GitProvider): get_logger().exception(f"Failed to get diff files, error: {e}") return [] - def publish_comment(self, pr_comment: str, is_temporary: bool = False, thread_context=None): + def publish_comment(self, pr_comment: str, is_temporary: bool = False, thread_context=None) -> Comment: if is_temporary and not get_settings().config.publish_output_progress: get_logger().debug(f"Skipping publish_comment for temporary comment: {pr_comment}") return None @@ -390,10 +384,11 @@ class AzureDevopsProvider(GitProvider): repository_id=self.repo_slug, pull_request_id=self.pr_num, ) - response = {"thread_id": thread_response.id, "comment_id": thread_response.comments[0].id} + created_comment = thread_response.comments[0] + created_comment.thread_id = thread_response.id if is_temporary: - self.temp_comments.append(response) - return response + self.temp_comments.append(created_comment) + return created_comment def publish_description(self, pr_title: str, pr_body: str): if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: @@ -522,7 +517,7 @@ class AzureDevopsProvider(GitProvider): def get_user_id(self): return 0 - def get_issue_comments(self): + def get_issue_comments(self) -> list[Comment]: threads = self.azure_devops_client.get_threads(repository_id=self.repo_slug, pull_request_id=self.pr_num, project=self.workspace_slug) threads.reverse() comment_list = [] @@ -562,7 +557,7 @@ class AzureDevopsProvider(GitProvider): return 
workspace_slug, repo_slug, pr_number @staticmethod - def _get_azure_devops_client(): + def _get_azure_devops_client() -> GitClient: org = get_settings().azure_devops.get("org", None) pat = get_settings().azure_devops.get("pat", None) diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 2895bd55..dfb5b224 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -228,7 +228,7 @@ class GitProvider(ABC): update_header: bool = True, name='review', final_update_message=True): - self.publish_comment(pr_comment) + return self.publish_comment(pr_comment) def publish_persistent_comment_full(self, pr_comment: str, initial_header: str, @@ -250,14 +250,13 @@ class GitProvider(ABC): # response = self.mr.notes.update(comment.id, {'body': pr_comment_updated}) self.edit_comment(comment, pr_comment_updated) if final_update_message: - self.publish_comment( + return self.publish_comment( f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}") - return + return comment except Exception as e: get_logger().exception(f"Failed to update persistent review, error: {e}") pass - self.publish_comment(pr_comment) - + return self.publish_comment(pr_comment) @abstractmethod def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): From c924affebc3e85e15d64dcd47c8c1591ff7f9ec1 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Wed, 7 May 2025 14:46:08 +0200 Subject: [PATCH 03/15] Azure devops provider: add persistent comment --- .../git_providers/azuredevops_provider.py | 22 ++++++++++++++----- pr_agent/tools/pr_code_suggestions.py | 8 ------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index 7524896c..4dd61e5b 100644 --- 
a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -170,10 +170,6 @@ class AzureDevopsProvider(GitProvider): return [] def is_supported(self, capability: str) -> bool: - if capability in [ - "get_issue_comments", - ]: - return False return True def set_pr(self, pr_url: str): @@ -390,6 +386,13 @@ class AzureDevopsProvider(GitProvider): self.temp_comments.append(created_comment) return created_comment + def publish_persistent_comment(self, pr_comment: str, + initial_header: str, + update_header: bool = True, + name='review', + final_update_message=True): + return self.publish_persistent_comment_full(pr_comment, initial_header, update_header, name, final_update_message) + def publish_description(self, pr_title: str, pr_body: str): if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: @@ -433,7 +436,6 @@ class AzureDevopsProvider(GitProvider): def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)]) - def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, absolute_position: int = None): position, absolute_position = find_line_number_of_relevant_line_in_file(self.get_diff_files(), @@ -617,3 +619,13 @@ class AzureDevopsProvider(GitProvider): def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: return self.pr_url+f"?_a=files&path={relevant_file}" + + def get_comment_url(self, comment) -> str: + return self.pr_url + "?discussionId=" + str(comment.thread_id) + + def get_latest_commit_url(self) -> str: + commits = self.azure_devops_client.get_pull_request_commits(self.repo_slug, self.pr_num, self.workspace_slug) + last = commits[0] + url = self.azure_devops_client.normalized_url + "/" + self.workspace_slug + "/_git/" + self.repo_slug + "/commit/" + last.commit_id + return 
url + \ No newline at end of file diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index c742aa06..0dea2e70 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -267,14 +267,6 @@ class PRCodeSuggestions: up_to_commit_txt = f" up to commit {match.group(0)[4:-3].strip()}" return up_to_commit_txt - if isinstance(git_provider, AzureDevopsProvider): # get_latest_commit_url is not supported yet - if progress_response: - git_provider.edit_comment(progress_response, pr_comment) - new_comment = progress_response - else: - new_comment = git_provider.publish_comment(pr_comment) - return new_comment - history_header = f"#### Previous suggestions\n" last_commit_num = git_provider.get_latest_commit_url().split('/')[-1][:7] if only_fold: # A user clicked on the 'self-review' checkbox From 3a07b55d0c0dcb3ae2299caee04139f2493620a8 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Wed, 7 May 2025 16:38:11 +0200 Subject: [PATCH 04/15] Azure: dont start threads as active because they block the pull request --- pr_agent/git_providers/azuredevops_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index 4dd61e5b..39e971ec 100644 --- a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -373,7 +373,7 @@ class AzureDevopsProvider(GitProvider): get_logger().debug(f"Skipping publish_comment for temporary comment: {pr_comment}") return None comment = Comment(content=pr_comment) - thread = CommentThread(comments=[comment], thread_context=thread_context, status=1) + thread = CommentThread(comments=[comment], thread_context=thread_context, status="closed") thread_response = self.azure_devops_client.create_thread( comment_thread=thread, project=self.workspace_slug, From 
67272700a6bf0b6ad932edc04bde687a0a96f6cc Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Fri, 9 May 2025 09:07:16 +0200 Subject: [PATCH 05/15] Azure: handle line comments --- .../git_providers/azuredevops_provider.py | 27 +++++++++++++ .../servers/azuredevops_server_webhook.py | 40 +++++++++++-------- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index 39e971ec..426953ae 100644 --- a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -117,6 +117,10 @@ class AzureDevopsProvider(GitProvider): get_logger().warning(f"Azure failed to publish code suggestion, error: {e}") return True + def reply_to_comment_from_comment_id(self, comment_id: int, body: str, is_temporary: bool = False) -> Comment: + # comment_id is actually thread_id + return self.reply_to_thread(comment_id, body, is_temporary) + def get_pr_description_full(self) -> str: return self.pr.description @@ -537,6 +541,29 @@ class AzureDevopsProvider(GitProvider): def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: return True + def set_like(self, thread_id: int, comment_id: int, create: bool = True): + if create: + self.azure_devops_client.create_like(self.repo_slug, self.pr_num, thread_id, comment_id, project=self.workspace_slug) + else: + self.azure_devops_client.delete_like(self.repo_slug, self.pr_num, thread_id, comment_id, project=self.workspace_slug) + + def set_thread_status(self, thread_id: int, status: str): + try: + self.azure_devops_client.update_thread(CommentThread(status=status), self.repo_slug, self.pr_num, thread_id, self.workspace_slug) + except Exception as e: + get_logger().exception(f"Failed to set thread status, error: {e}") + + def reply_to_thread(self, thread_id: int, body: str, is_temporary: bool = False) -> Comment: + try: + comment = 
Comment(content=body) + response = self.azure_devops_client.create_comment(comment, self.repo_slug, self.pr_num, thread_id, self.workspace_slug) + response.thread_id = thread_id + if is_temporary: + self.temp_comments.append(response) + return response + except Exception as e: + get_logger().exception(f"Failed to reply to thread, error: {e}") + @staticmethod def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]: parsed_url = urlparse(pr_url) diff --git a/pr_agent/servers/azuredevops_server_webhook.py b/pr_agent/servers/azuredevops_server_webhook.py index bb97b839..3a03250b 100644 --- a/pr_agent/servers/azuredevops_server_webhook.py +++ b/pr_agent/servers/azuredevops_server_webhook.py @@ -22,6 +22,7 @@ from starlette_context.middleware import RawContextMiddleware from pr_agent.agent.pr_agent import PRAgent, command2class from pr_agent.algo.utils import update_settings_from_args from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider_with_context from pr_agent.git_providers.utils import apply_repo_settings from pr_agent.log import LoggingFormat, get_logger, setup_logger @@ -33,14 +34,18 @@ azure_devops_server = get_settings().get("azure_devops_server") WEBHOOK_USERNAME = azure_devops_server.get("webhook_username") WEBHOOK_PASSWORD = azure_devops_server.get("webhook_password") -async def handle_request_comment( url: str, body: str, log_context: dict -): +async def handle_request_comment(url: str, body: str, thread_id: int, comment_id: int, log_context: dict): log_context["action"] = body log_context["api_url"] = url - try: with get_logger().contextualize(**log_context): - await PRAgent().handle_request(url, body) + agent = PRAgent() + provider = get_git_provider_with_context(pr_url=url) + handled = await agent.handle_request(url, body, notify=lambda: provider.reply_to_thread(thread_id, "On it! 
⏳", True)) + # mark command comment as closed + if handled: + provider.set_thread_status(thread_id, "closed") + provider.remove_initial_comment() except Exception as e: get_logger().exception(f"Failed to handle webhook", artifact={"url": url, "body": body}, error=str(e)) @@ -83,7 +88,6 @@ async def _perform_commands_azure(commands_conf: str, agent: PRAgent, api_url: s async def handle_request_azure(data, log_context): - actions = [] if data["eventType"] == "git.pullrequest.created": # API V1 (latest) pr_url = unquote(data["resource"]["_links"]["web"]["href"].replace("_apis/git/repositories", "_git")) @@ -95,11 +99,16 @@ async def handle_request_azure(data, log_context): content=jsonable_encoder({"message": "webhook triggered successfully"}) ) elif data["eventType"] == "ms.vss-code.git-pullrequest-comment-event" and "content" in data["resource"]["comment"]: - if available_commands_rgx.match(data["resource"]["comment"]["content"]): + comment = data["resource"]["comment"] + if available_commands_rgx.match(comment["content"]): if(data["resourceVersion"] == "2.0"): repo = data["resource"]["pullRequest"]["repository"]["webUrl"] pr_url = unquote(f'{repo}/pullrequest/{data["resource"]["pullRequest"]["pullRequestId"]}') - actions = [data["resource"]["comment"]["content"]] + action = comment["content"] + thread_url = comment["_links"]["threads"]["href"] + thread_id = int(thread_url.split("/")[-1]) + comment_id = int(comment["id"]) + pass else: # API V1 not supported as it does not contain the PR URL return JSONResponse( @@ -119,15 +128,14 @@ async def handle_request_azure(data, log_context): log_context["event"] = data["eventType"] log_context["api_url"] = pr_url - for action in actions: - try: - await handle_request_comment(pr_url, action, log_context) - except Exception as e: - get_logger().error("Azure DevOps Trigger failed. 
Error:" + str(e)) - return JSONResponse( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - content=json.dumps({"message": "Internal server error"}), - ) + try: + await handle_request_comment(pr_url, action, thread_id, comment_id, log_context) + except Exception as e: + get_logger().error("Azure DevOps Trigger failed. Error:" + str(e)) + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content=json.dumps({"message": "Internal server error"}), + ) return JSONResponse( status_code=status.HTTP_202_ACCEPTED, content=jsonable_encoder({"message": "webhook triggered successfully"}) ) From 24a90cab8e91475c3dce65aade65c73892b2e611 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Fri, 9 May 2025 12:13:42 +0200 Subject: [PATCH 06/15] Azure: handle inline /ask --- .../git_providers/azuredevops_provider.py | 9 ++++++- .../servers/azuredevops_server_webhook.py | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index 426953ae..d77efc27 100644 --- a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -21,7 +21,7 @@ try: # noinspection PyUnresolvedReferences from azure.devops.connection import Connection # noinspection PyUnresolvedReferences - from azure.devops.released.git import (Comment, CommentThread, GitPullRequest, GitVersionDescriptor, GitClient) + from azure.devops.released.git import (Comment, CommentThread, GitPullRequest, GitVersionDescriptor, GitClient, CommentThreadContext) # noinspection PyUnresolvedReferences from azure.identity import DefaultAzureCredential from msrest.authentication import BasicAuthentication @@ -564,6 +564,13 @@ class AzureDevopsProvider(GitProvider): except Exception as e: get_logger().exception(f"Failed to reply to thread, error: {e}") + def get_thread_context(self, thread_id: int) 
-> CommentThreadContext: + try: + thread = self.azure_devops_client.get_pull_request_thread(self.repo_slug, self.pr_num, thread_id, self.workspace_slug) + return thread.thread_context + except Exception as e: + get_logger().exception(f"Failed to set thread status, error: {e}") + @staticmethod def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]: parsed_url = urlparse(pr_url) diff --git a/pr_agent/servers/azuredevops_server_webhook.py b/pr_agent/servers/azuredevops_server_webhook.py index 3a03250b..45533385 100644 --- a/pr_agent/servers/azuredevops_server_webhook.py +++ b/pr_agent/servers/azuredevops_server_webhook.py @@ -23,6 +23,7 @@ from pr_agent.agent.pr_agent import PRAgent, command2class from pr_agent.algo.utils import update_settings_from_args from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider_with_context +from pr_agent.git_providers.azuredevops_provider import AzureDevopsProvider from pr_agent.git_providers.utils import apply_repo_settings from pr_agent.log import LoggingFormat, get_logger, setup_logger @@ -41,6 +42,7 @@ async def handle_request_comment(url: str, body: str, thread_id: int, comment_id with get_logger().contextualize(**log_context): agent = PRAgent() provider = get_git_provider_with_context(pr_url=url) + body = handle_line_comment(body, thread_id, provider) handled = await agent.handle_request(url, body, notify=lambda: provider.reply_to_thread(thread_id, "On it! 
⏳", True)) # mark command comment as closed if handled: @@ -49,6 +51,29 @@ async def handle_request_comment(url: str, body: str, thread_id: int, comment_id except Exception as e: get_logger().exception(f"Failed to handle webhook", artifact={"url": url, "body": body}, error=str(e)) +def handle_line_comment(body: str, thread_id: int, provider: AzureDevopsProvider): + body = body.strip() + if not body.startswith('/ask '): + return body + thread_context = provider.get_thread_context(thread_id) + if not thread_context: + return body + + path = thread_context.file_path + if thread_context.left_file_end or thread_context.left_file_start: + start_line = thread_context.left_file_start.line + end_line = thread_context.left_file_end.line + side = "left" + elif thread_context.right_file_end or thread_context.right_file_start: + start_line = thread_context.right_file_start.line + end_line = thread_context.right_file_end.line + side = "right" + else: + get_logger().info("No line range found in thread context", artifact={"thread_context": thread_context}) + return body + + question = body[5:].lstrip() # remove 4 chars: '/ask ' + return f"/ask_line --line_start={start_line} --line_end={end_line} --side={side} --file_name={path} --comment_id={thread_id} {question}" # currently only basic auth is supported with azure webhooks # for this reason, https must be enabled to ensure the credentials are not sent in clear text From 954d61e5dc232d9151a9820901b4eebff68ed949 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Fri, 9 May 2025 13:12:47 +0200 Subject: [PATCH 07/15] Azure: refactor publish_code_suggestions() to use azure classes --- .../git_providers/azuredevops_provider.py | 43 ++++--------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/pr_agent/git_providers/azuredevops_provider.py b/pr_agent/git_providers/azuredevops_provider.py index d77efc27..35165bdd 100644 --- 
a/pr_agent/git_providers/azuredevops_provider.py +++ b/pr_agent/git_providers/azuredevops_provider.py @@ -21,7 +21,7 @@ try: # noinspection PyUnresolvedReferences from azure.devops.connection import Connection # noinspection PyUnresolvedReferences - from azure.devops.released.git import (Comment, CommentThread, GitPullRequest, GitVersionDescriptor, GitClient, CommentThreadContext) + from azure.devops.released.git import (Comment, CommentThread, GitPullRequest, GitVersionDescriptor, GitClient, CommentThreadContext, CommentPosition) # noinspection PyUnresolvedReferences from azure.identity import DefaultAzureCredential from msrest.authentication import BasicAuthentication @@ -73,40 +73,13 @@ class AzureDevopsProvider(GitProvider): f"relevant_lines_start is {relevant_lines_start}") continue - if relevant_lines_end > relevant_lines_start: - post_parameters = { - "body": body, - "path": relevant_file, - "line": relevant_lines_end, - "start_line": relevant_lines_start, - "start_side": "RIGHT", - } - else: # API is different for single line comments - post_parameters = { - "body": body, - "path": relevant_file, - "line": relevant_lines_start, - "side": "RIGHT", - } - post_parameters_list.append(post_parameters) - if not post_parameters_list: - return False - - for post_parameters in post_parameters_list: + thread_context = CommentThreadContext( + file_path=relevant_file, + right_file_start=CommentPosition(offset=1, line=relevant_lines_start), + right_file_end=CommentPosition(offset=1, line=relevant_lines_end)) + comment = Comment(content=body, comment_type=1) + thread = CommentThread(comments=[comment], thread_context=thread_context) try: - comment = Comment(content=post_parameters["body"], comment_type=1) - thread = CommentThread(comments=[comment], - thread_context={ - "filePath": post_parameters["path"], - "rightFileStart": { - "line": post_parameters["start_line"], - "offset": 1, - }, - "rightFileEnd": { - "line": post_parameters["line"], - "offset": 1, - }, - }) 
self.azure_devops_client.create_thread( comment_thread=thread, project=self.workspace_slug, @@ -114,7 +87,7 @@ class AzureDevopsProvider(GitProvider): pull_request_id=self.pr_num ) except Exception as e: - get_logger().warning(f"Azure failed to publish code suggestion, error: {e}") + get_logger().error(f"Azure failed to publish code suggestion, error: {e}", suggestion=suggestion) return True def reply_to_comment_from_comment_id(self, comment_id: int, body: str, is_temporary: bool = False) -> Comment: From db0c213d72347e7dec8476b313985225f8d78e98 Mon Sep 17 00:00:00 2001 From: Thomas De Keulenaer <11250711+twdkeule@users.noreply.github.com> Date: Fri, 9 May 2025 13:13:29 +0200 Subject: [PATCH 08/15] AzureDevops webhook: allow disabling BasicAuth Azure webhooks do not allow BasicAuth without HTTPS --- .../servers/azuredevops_server_webhook.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pr_agent/servers/azuredevops_server_webhook.py b/pr_agent/servers/azuredevops_server_webhook.py index 45533385..8eacbf66 100644 --- a/pr_agent/servers/azuredevops_server_webhook.py +++ b/pr_agent/servers/azuredevops_server_webhook.py @@ -28,12 +28,12 @@ from pr_agent.git_providers.utils import apply_repo_settings from pr_agent.log import LoggingFormat, get_logger, setup_logger setup_logger(fmt=LoggingFormat.JSON, level=get_settings().get("CONFIG.LOG_LEVEL", "DEBUG")) -security = HTTPBasic() +security = HTTPBasic(auto_error=False) router = APIRouter() available_commands_rgx = re.compile(r"^\/(" + "|".join(command2class.keys()) + r")\s*") azure_devops_server = get_settings().get("azure_devops_server") -WEBHOOK_USERNAME = azure_devops_server.get("webhook_username") -WEBHOOK_PASSWORD = azure_devops_server.get("webhook_password") +WEBHOOK_USERNAME = azure_devops_server.get("webhook_username", None) +WEBHOOK_PASSWORD = azure_devops_server.get("webhook_password", None) async def handle_request_comment(url: str, body: str, thread_id: int, comment_id: 
int, log_context: dict): log_context["action"] = body @@ -78,14 +78,17 @@ def handle_line_comment(body: str, thread_id: int, provider: AzureDevopsProvider # currently only basic auth is supported with azure webhooks # for this reason, https must be enabled to ensure the credentials are not sent in clear text def authorize(credentials: HTTPBasicCredentials = Depends(security)): - is_user_ok = secrets.compare_digest(credentials.username, WEBHOOK_USERNAME) - is_pass_ok = secrets.compare_digest(credentials.password, WEBHOOK_PASSWORD) - if not (is_user_ok and is_pass_ok): - raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, - detail='Incorrect username or password.', - headers={'WWW-Authenticate': 'Basic'}, - ) + if WEBHOOK_USERNAME is None or WEBHOOK_PASSWORD is None: + return + + is_user_ok = secrets.compare_digest(credentials.username, WEBHOOK_USERNAME) + is_pass_ok = secrets.compare_digest(credentials.password, WEBHOOK_PASSWORD) + if not (is_user_ok and is_pass_ok): + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail='Incorrect username or password.', + headers={'WWW-Authenticate': 'Basic'}, + ) async def _perform_commands_azure(commands_conf: str, agent: PRAgent, api_url: str, log_context: dict): From d6aaf8a7097cee2e2c6095fc203798c43f0a99f7 Mon Sep 17 00:00:00 2001 From: ofir-frd <85901822+ofir-frd@users.noreply.github.com> Date: Mon, 12 May 2025 11:13:01 +0300 Subject: [PATCH 09/15] docs: Update "Apply this suggestion" to "Apply / Chat" --- docs/docs/tools/improve.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/tools/improve.md b/docs/docs/tools/improve.md index 5ddb01d9..aa0c051f 100644 --- a/docs/docs/tools/improve.md +++ b/docs/docs/tools/improve.md @@ -18,7 +18,7 @@ The tool can be triggered automatically every time a new PR is [opened](../usage ___ !!! 
note "The following features are available only for Qodo Merge 💎 users:" - - The `Apply this suggestion` checkbox, which interactively converts a suggestion into a committable code comment + - The `Apply / Chat` checkbox, which interactively converts a suggestion into a committable code comment - The `More` checkbox to generate additional suggestions ## Example usage From 489a16a3e63cdef9dedccbf6317e2a093c33fe7f Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 13 May 2025 08:05:36 +0300 Subject: [PATCH 10/15] docs: reorganize documentation structure and move PR benchmark section --- docs/docs/core-abilities/index.md | 1 - docs/docs/{finetuning_benchmark => pr_benchmark}/index.md | 0 docs/docs/usage-guide/index.md | 1 + docs/mkdocs.yml | 6 +++--- 4 files changed, 4 insertions(+), 4 deletions(-) rename docs/docs/{finetuning_benchmark => pr_benchmark}/index.md (100%) diff --git a/docs/docs/core-abilities/index.md b/docs/docs/core-abilities/index.md index d06a39ae..9af26e2e 100644 --- a/docs/docs/core-abilities/index.md +++ b/docs/docs/core-abilities/index.md @@ -3,7 +3,6 @@ Qodo Merge utilizes a variety of core abilities to provide a comprehensive and efficient code review experience. 
These abilities include: - [Auto best practices](https://qodo-merge-docs.qodo.ai/core-abilities/auto_best_practices/) -- [Pull request benchmark](https://qodo-merge-docs.qodo.ai/finetuning_benchmark/) - [Code validation](https://qodo-merge-docs.qodo.ai/core-abilities/code_validation/) - [Compression strategy](https://qodo-merge-docs.qodo.ai/core-abilities/compression_strategy/) - [Dynamic context](https://qodo-merge-docs.qodo.ai/core-abilities/dynamic_context/) diff --git a/docs/docs/finetuning_benchmark/index.md b/docs/docs/pr_benchmark/index.md similarity index 100% rename from docs/docs/finetuning_benchmark/index.md rename to docs/docs/pr_benchmark/index.md diff --git a/docs/docs/usage-guide/index.md b/docs/docs/usage-guide/index.md index 8de093af..34a66dd5 100644 --- a/docs/docs/usage-guide/index.md +++ b/docs/docs/usage-guide/index.md @@ -22,4 +22,5 @@ It includes information on how to adjust Qodo Merge configurations, define which - [Working with large PRs](./additional_configurations.md#working-with-large-prs) - [Changing a model](./additional_configurations.md#changing-a-model) - [Patch Extra Lines](./additional_configurations.md#patch-extra-lines) +- [FAQ](https://qodo-merge-docs.qodo.ai/faq/) - [Qodo Merge Models](./qodo_merge_models) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 425b2db7..e8dd4390 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -20,6 +20,7 @@ nav: - Managing Mail Notifications: 'usage-guide/mail_notifications.md' - Changing a Model: 'usage-guide/changing_a_model.md' - Additional Configurations: 'usage-guide/additional_configurations.md' + - Frequently Asked Questions: 'faq/index.md' - 💎 Qodo Merge Models: 'usage-guide/qodo_merge_models.md' - Tools: - 'tools/index.md' @@ -43,7 +44,6 @@ nav: - Core Abilities: - 'core-abilities/index.md' - Auto best practices: 'core-abilities/auto_best_practices.md' - - Pull request benchmark: 'finetuning_benchmark/index.md' - Code validation: 'core-abilities/code_validation.md' - Compression 
strategy: 'core-abilities/compression_strategy.md' - Dynamic context: 'core-abilities/dynamic_context.md' @@ -59,8 +59,8 @@ nav: - Features: 'chrome-extension/features.md' - Data Privacy: 'chrome-extension/data_privacy.md' - Options: 'chrome-extension/options.md' - - FAQ: - - FAQ: 'faq/index.md' + - PR Benchmark: + - FAQ: 'pr_benchmark/index.md' - Recent Updates: - Recent Updates: 'recent_updates/index.md' - AI Docs Search: 'ai_search/index.md' From 25530a8b2c805b07809aa5df18ed822035c2a506 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 13 May 2025 08:39:19 +0300 Subject: [PATCH 11/15] docs: add benchmark methodology and improve model comparison formatting --- docs/docs/pr_benchmark/index.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/docs/docs/pr_benchmark/index.md b/docs/docs/pr_benchmark/index.md index 85c31200..5e25a469 100644 --- a/docs/docs/pr_benchmark/index.md +++ b/docs/docs/pr_benchmark/index.md @@ -2,9 +2,23 @@ ## Methodology -... +Qodo Merge PR Benchmark evaluates and compares the performance of two Large Language Models (LLMs) in analyzing pull request code and providing meaningful code suggestions. +Our diverse dataset comprises of 400 pull requests from over 100 repositories, spanning various programming languages and frameworks to reflect real-world scenarios. -## Gemini-2.5-pro-preview-05-06 +- For each pull request, two distinct LLMs process the same prompt using the Qodo Merge `improve` tool, each generating two sets of responses. The prompt for response generation can be found [here](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml). + +- Subsequently, a high-performing third model (an AI judge) evaluates the responses from the initial two models to determine the superior one. We utilize OpenAI's `o3` model as the judge, though other models have yielded consistent results. 
The prompt for this comparative judgment is available [here](https://github.com/Codium-ai/pr-agent-settings/tree/main/benchmark). + +- We aggregate comparison outcomes across all the pull requests, calculating the win rate for each model. We also analyze the qualitative feedback (the "why" explanations from the judge) to identify each model's comparative strengths and weaknesses. +This approach provides not just a quantitative score but also a detailed analysis of each model's strengths and weaknesses. + +- The final output is a "Model Card", comparing the evaluated model against others. To ensure full transparency and enable community scrutiny, we also share the raw code suggestions generated by each model, and the judge's specific feedback. + +Note that this benchmark focuses on quality: the ability of an LLM to process complex pull requests with multiple files and nuanced tasks to produce high-quality code suggestions. +Other factors like speed, cost, and availability, while also relevant for model selection, are outside this benchmark's scope. + + +## Gemini-2.5-pro-preview-05-06 - Model Card ### Gemini-2.5-pro-preview-05-06 vs GPT-4.1 @@ -14,15 +28,15 @@ Model 'Gemini-2.5-pro-preview-05-06' is generally more useful thanks to wider and more accurate bug detection and concrete patches, but it sacrifices compliance discipline and sometimes oversteps the task rules. Model 'GPT-4.1' is safer and highly rule-abiding, yet often too timid—missing many genuine issues and providing limited insight. An ideal reviewer would combine 'GPT-4.1' restraint with 'Gemini-2.5-pro-preview-05-06' thoroughness. -#### Gemini-2.5-pro-preview-05-06 vs GPT-4.1 - Detailed Analysis +#### Detailed Analysis -strengths: +Gemini-2.5-pro-preview-05-06 vs GPT-4.1 strengths: - better_bug_coverage: Detects and explains more critical issues, winning in ~70 % of comparisons and achieving a higher average score. 
- actionable_fixes: Supplies clear code snippets, correct language labels, and often multiple coherent suggestions per diff. - deeper_reasoning: Shows stronger grasp of logic, edge cases, and cross-file implications, leading to broader, high-impact reviews. -weaknesses: +Gemini-2.5-pro-preview-05-06 vs GPT-4.1 weaknesses: - guideline_violations: More prone to over-eager advice—non-critical tweaks, touching unchanged code, suggesting new imports, or minor format errors. - occasional_overreach: Some fixes are speculative or risky, potentially introducing new bugs. @@ -40,15 +54,15 @@ Model 'Gemini-2.5-pro-preview-05-06' is the stronger reviewer—more frequently See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/benchmark/sonnet_37_vs_gemini-2.5-pro-preview-05-06.md) -#### Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 - Detailed Analysis +#### Detailed Analysis -strengths: +Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 strengths: - higher_accuracy_and_coverage: finds real critical bugs and supplies actionable patches in most examples (better in 78 % of cases). - guideline_awareness: usually respects new-lines-only scope, ≤3 suggestions, proper YAML, and stays silent when no issues exist. - detailed_reasoning_and_patches: explanations tie directly to the diff and fixes are concrete, often catching multiple related defects that 'Sonnet 3.7' overlooks. -weaknesses: +Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 weaknesses: - occasional_rule_violations: sometimes proposes new imports, package-version changes, or edits outside the added lines. - overzealous_suggestions: may add speculative or stylistic fixes that exceed the “critical” scope, or mis-label severity. 
From 3ec5bc12b7fbf5644533c29be8bf3e8ad764f3c5 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 13 May 2025 08:53:03 +0300 Subject: [PATCH 12/15] s --- docs/docs/pr_benchmark/index.md | 206 ++++++++++---------------------- 1 file changed, 61 insertions(+), 145 deletions(-) diff --git a/docs/docs/pr_benchmark/index.md b/docs/docs/pr_benchmark/index.md index 5e25a469..2624e1b2 100644 --- a/docs/docs/pr_benchmark/index.md +++ b/docs/docs/pr_benchmark/index.md @@ -17,6 +17,15 @@ This approach provides not just a quantitative score but also a detailed analysi Note that this benchmark focuses on quality: the ability of an LLM to process complex pull request with multiple files and nuanced task to produce high-quality code suggestions. Other factors like speed, cost, and availability, while also relevant for model selection, are outside this benchmark's scope. +## TL;DR + +Here's a summary of the win rates based on the benchmark: + +| Model A | Model B | Model A Win Rate | Model B Win Rate | +|-------------------------------|-------------------------------|------------------|------------------| +| Gemini-2.5-pro-preview-05-06 | GPT-4.1 | 70.4% | 29.6% | +| Gemini-2.5-pro-preview-05-06 | Sonnet 3.7 | 78.1% | 21.9% | +| GPT-4.1 | Sonnet 3.7 | 61.0% | 39.0% | ## Gemini-2.5-pro-preview-05-06 - Model Card @@ -68,180 +77,87 @@ Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 weaknesses: - overzealous_suggestions: may add speculative or stylistic fixes that exceed the “critical” scope, or mis-label severity. - sporadic_technical_slips: a few patches contain minor coding errors, oversized snippets, or duplicate/contradicting advice. +## GPT-4.1 - Model Card +### GPT-4.1 vs Sonnet 3.7 +![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.png){width=768} -[//]: # (On coding tasks, the gap between open-source models and top closed-source models such as Claude and GPT is significant.) +#### Analysis Summary -[//]: # (
) +Model 'GPT-4.1' is safer and more compliant, preferring silence over speculation, which yields fewer rule breaches and false positives but misses some real bugs. +Model 'Sonnet 3.7' is more adventurous and often uncovers important issues that 'GPT-4.1' ignores, yet its aggressive style leads to frequent guideline violations and a higher proportion of incorrect or non-critical advice. -[//]: # (In practice, open-source models are unsuitable for most real-world code tasks, and require further fine-tuning to produce acceptable results.) +See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.md) -[//]: # () -[//]: # (_Qodo Merge pull request benchmark_ aims to benchmark models on their ability to be fine-tuned for a coding task.) -[//]: # (Specifically, we chose to fine-tune open-source models on the task of analyzing a pull request, and providing useful feedback and code suggestions.) +#### Detailed Analysis -[//]: # () -[//]: # (Here are the results:) +Model 'GPT-4.1' vs 'Sonnet 3.7' +strengths: +- Strong guideline adherence: usually stays strictly on `+` lines, avoids non-critical or stylistic advice, and rarely suggests forbidden imports; often outputs an empty list when no real bug exists. +- Lower false-positive rate: suggestions are more accurate and seldom introduce new bugs; fixes compile more reliably. +- Good schema discipline: YAML is almost always well-formed and fields are populated correctly. -[//]: # (
) +weaknesses: +- Misses bugs: often returns an empty list even when a clear critical issue is present, so coverage is narrower. +- Sparse feedback: when it does comment, it tends to give fewer suggestions and sometimes lacks depth or completeness. +- Occasional metadata/slip-ups (wrong language tags, overly broad code spans), though less harmful than Sonnet 3.7 errors. -[//]: # (
) +### GPT-4.1 vs Gemini-2.5-pro-preview-05-06 -[//]: # () -[//]: # (**Model performance:**) +![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} -[//]: # () -[//]: # (| Model name | Model size [B] | Better than gpt-4 rate, after fine-tuning [%] |) +#### Analysis Summary -[//]: # (|-----------------------------|----------------|----------------------------------------------|) +Model 'Gemini-2.5-pro-preview-05-06' is generally more useful thanks to wider and more accurate bug detection and concrete patches, but it sacrifices compliance discipline and sometimes oversteps the task rules. Model 'GPT-4.1' is safer and highly rule-abiding, yet often too timid—missing many genuine issues and providing limited insight. An ideal reviewer would combine 'GPT-4.1’ restraint with 'Gemini-2.5-pro-preview-05-06' thoroughness. -[//]: # (| **DeepSeek 34B-instruct** | **34** | **40.7** |) +#### Detailed Analysis -[//]: # (| DeepSeek 34B-base | 34 | 38.2 |) +GPT-4.1 strengths: +- strict_compliance: Usually sticks to the “critical bugs only / new ‘+’ lines only” rule, so outputs rarely violate task constraints. +- low_risk: Conservative behaviour avoids harmful or speculative fixes; safer when no obvious issue exists. +- concise_formatting: Tends to produce minimal, correctly-structured YAML without extra noise. -[//]: # (| Phind-34b | 34 | 38 |) +GPT-4.1 weaknesses: +- under_detection: Frequently returns an empty list even when real bugs are present, missing ~70 % of the time. +- shallow_analysis: When it does suggest fixes, coverage is narrow and technical depth is limited, sometimes with wrong language tags or minor format slips. +- occasional_inaccuracy: A few suggestions are unfounded or duplicate, and rare guideline breaches (e.g., import advice) still occur. 
-[//]: # (| Granite-34B | 34 | 37.6 |) -[//]: # (| Codestral-22B-v0.1 | 22 | 32.7 |) +## Sonnet 3.7 - Model Card -[//]: # (| QWEN-1.5-32B | 32 | 29 |) +### Sonnet 3.7 vs GPT-4.1 -[//]: # (| | | |) +![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.png){width=768} -[//]: # (| **CodeQwen1.5-7B** | **7** | **35.4** |) +#### Analysis Summary -[//]: # (| Llama-3.1-8B-Instruct | 8 | 35.2 |) +Model 'GPT-4.1' is safer and more compliant, preferring silence over speculation, which yields fewer rule breaches and false positives but misses some real bugs. +Model 'Sonnet 3.7' is more adventurous and often uncovers important issues that 'GPT-4.1' ignores, yet its aggressive style leads to frequent guideline violations and a higher proportion of incorrect or non-critical advice. -[//]: # (| Granite-8b-code-instruct | 8 | 34.2 |) +See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.md) -[//]: # (| CodeLlama-7b-hf | 7 | 31.8 |) +#### Detailed Analysis -[//]: # (| Gemma-7B | 7 | 27.2 |) +Model 'Sonnet 3.7' vs 'GPT-4.1' +'Sonnet 3.7' strengths: +- Better bug discovery breadth: more willing to dive into logic and spot critical problems that 'GPT-4.1' overlooks; often supplies multiple, detailed fixes. +- Richer explanations & patches: gives fuller context and, when correct, proposes more functional or user-friendly solutions. +- Generally correct language/context tagging and targeted code snippets. -[//]: # (| DeepSeek coder-7b-instruct | 7 | 26.8 |) +'Sonnet 3.7' weaknesses: +- Guideline violations: frequently flags non-critical issues, edits untouched code, or recommends adding imports, breaching task rules. +- Higher error rate: suggestions are more speculative and sometimes introduce new defects or duplicate work already done. +- Occasional schema or formatting mistakes (missing list value, duplicated suggestions), reducing reliability. 
-[//]: # (| Llama-3-8B-Instruct | 8 | 26.8 |) -[//]: # (| Mistral-7B-v0.1 | 7 | 16.1 |) +### Sonnet 3.7 vs Gemini-2.5-pro-preview-05-06 -[//]: # () -[//]: # (
) +![Comparison](https://codium.ai/images/qodo_merge_benchmark/sonnet_37_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} -[//]: # () -[//]: # (**Fine-tuning impact:**) +#### Analysis Summary -[//]: # () -[//]: # (| Model name | Model size [B] | Fine-tuned | Better than gpt-4 rate [%] |) +Model 'Gemini-2.5-pro-preview-05-06' is the stronger reviewer—more frequently identifies genuine, high-impact bugs and provides well-formed, actionable fixes. Model 'Sonnet 3.7' is safer against false positives and tends to be concise but often misses important defects or offers low-value or incorrect suggestions. -[//]: # (|---------------------------|----------------|------------|----------------------------|) - -[//]: # (| DeepSeek 34B-instruct | 34 | yes | 40.7 |) - -[//]: # (| DeepSeek 34B-instruct | 34 | no | 3.6 |) - -[//]: # () -[//]: # (## Results analysis) - -[//]: # () -[//]: # (- **Fine-tuning is a must** - without fine-tuning, open-source models provide poor results on most real-world code tasks, which include complicated prompt and lengthy context. We clearly see that without fine-tuning, deepseek model was 96.4% of the time inferior to GPT-4, while after fine-tuning, it is better 40.7% of the time.) - -[//]: # (- **Always start from a code-dedicated model** — When fine-tuning, always start from a code-dedicated model, and not from a general-usage model. The gaps in downstream results are very big.) - -[//]: # (- **Don't believe the hype** —newer models, or models from big-tech companies (Llama3, Gemma, Mistral), are not always better for fine-tuning.) - -[//]: # (- **The best large model** - For large 34B code-dedicated models, the gaps when doing proper fine-tuning are small. The current top model is **DeepSeek 34B-instruct**) - -[//]: # (- **The best small model** - For small 7B code-dedicated models, the gaps when fine-tuning are much larger. **CodeQWEN 1.5-7B** is by far the best model for fine-tuning.) - -[//]: # (- **Base vs. 
instruct** - For the top model (deepseek), we saw small advantage when starting from the instruct version. However, we recommend testing both versions on each specific task, as the base model is generally considered more suitable for fine-tuning.) - -[//]: # () -[//]: # (## Dataset) - -[//]: # () -[//]: # (### Training dataset) - -[//]: # () -[//]: # (Our training dataset comprises 25,000 pull requests, aggregated from permissive license repos. For each pull request, we generated responses for the three main tools of Qodo Merge:) - -[//]: # ([Describe](https://qodo-merge-docs.qodo.ai/tools/describe/), [Review](https://qodo-merge-docs.qodo.ai/tools/improve/) and [Improve](https://qodo-merge-docs.qodo.ai/tools/improve/).) - -[//]: # () -[//]: # (On the raw data collected, we employed various automatic and manual cleaning techniques to ensure the outputs were of the highest quality, and suitable for instruct-tuning.) - -[//]: # () -[//]: # (Here are the prompts, and example outputs, used as input-output pairs to fine-tune the models:) - -[//]: # () -[//]: # (| Tool | Prompt | Example output |) - -[//]: # (|----------|------------------------------------------------------------------------------------------------------------|----------------|) - -[//]: # (| Describe | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_description_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issue-2303989601) |) - -[//]: # (| Review | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issuecomment-2118761219) |) - -[//]: # (| Improve | [link](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_code_suggestions_prompts.toml) | [link](https://github.com/Codium-ai/pr-agent/pull/910#issuecomment-2118761309) |) - -[//]: # () -[//]: # (### Evaluation dataset) - -[//]: # () -[//]: # (- For each tool, we aggregated 200 
additional examples to be used for evaluation. These examples were not used in the training dataset, and were manually selected to represent diverse real-world use-cases.) - -[//]: # (- For each test example, we generated two responses: one from the fine-tuned model, and one from the best code model in the world, `gpt-4-turbo-2024-04-09`.) - -[//]: # () -[//]: # (- We used a third LLM to judge which response better answers the prompt, and will likely be perceived by a human as better response.) - -[//]: # (
) - -[//]: # () -[//]: # (We experimented with three model as judges: `gpt-4-turbo-2024-04-09`, `gpt-4o`, and `claude-3-opus-20240229`. All three produced similar results, with the same ranking order. This strengthens the validity of our testing protocol.) - -[//]: # (The evaluation prompt can be found [here](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/pr_evaluate_prompt_response.toml)) - -[//]: # () -[//]: # (Here is an example of a judge model feedback:) - -[//]: # () -[//]: # (```) - -[//]: # (command: improve) - -[//]: # (model1_score: 9,) - -[//]: # (model2_score: 6,) - -[//]: # (why: |) - -[//]: # ( Response 1 is better because it provides more actionable and specific suggestions that directly) - -[//]: # ( enhance the code's maintainability, performance, and best practices. For example, it suggests) - -[//]: # ( using a variable for reusable widget instances and using named routes for navigation, which) - -[//]: # ( are practical improvements. In contrast, Response 2 focuses more on general advice and less) - -[//]: # ( actionable suggestions, such as changing variable names and adding comments, which are less) - -[//]: # ( critical for immediate code improvement.") - -[//]: # (```) - -[//]: # () -[//]: # (## Comparing Top Closed-Source Models) - -[//]: # () -[//]: # (Another application of the Pull Request Benchmark is comparing leading closed-source models to determine which performs better at analyzing pull request code.) 
- -[//]: # () -[//]: # (The evaluation methodology resembles the approach used for evaluating fine-tuned models:) - -[//]: # () -[//]: # (- We ran each model across 200 diverse pull requests, asking them to generate code suggestions using Qodo Merge's `improve` tool) - -[//]: # (- A third top model served as judge to determine which response better fulfilled the prompt and would likely be perceived as superior by human users) +See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/benchmark/sonnet_37_vs_gemini-2.5-pro-preview-05-06.md) From cbfbfa662d210866ac7c89b1c5418332c1a49bd9 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 13 May 2025 09:05:07 +0300 Subject: [PATCH 13/15] docs: enhance benchmark table with colored win rates and improve comparison headings --- docs/docs/pr_benchmark/index.md | 49 +++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/docs/docs/pr_benchmark/index.md b/docs/docs/pr_benchmark/index.md index 2624e1b2..37d2d022 100644 --- a/docs/docs/pr_benchmark/index.md +++ b/docs/docs/pr_benchmark/index.md @@ -21,15 +21,42 @@ Other factors like speed, cost, and availability, while also relevant for model Here's a summary of the win rates based on the benchmark: -| Model A | Model B | Model A Win Rate | Model B Win Rate | -|-------------------------------|-------------------------------|------------------|------------------| -| Gemini-2.5-pro-preview-05-06 | GPT-4.1 | 70.4% | 29.6% | -| Gemini-2.5-pro-preview-05-06 | Sonnet 3.7 | 78.1% | 21.9% | -| GPT-4.1 | Sonnet 3.7 | 61.0% | 39.0% | +[//]: # (| Model A | Model B | Model A Win Rate | Model B Win Rate |) + +[//]: # (|:-------------------------------|:-------------------------------|:----------------:|:----------------:|) + +[//]: # (| Gemini-2.5-pro-preview-05-06 | GPT-4.1 | 70.4% | 29.6% |) + +[//]: # (| Gemini-2.5-pro-preview-05-06 | Sonnet 3.7 | 78.1% | 21.9% |) + +[//]: # (| GPT-4.1 | Sonnet 3.7 | 61.0% | 39.0% |) + + + + + + 
+ + + + + + + + + + + + + + + + +
Model AModel BModel A Win Rate Model B Win Rate
Gemini-2.5-pro-preview-05-06GPT-4.170.4% 29.6%
Gemini-2.5-pro-preview-05-06Sonnet 3.778.1% 21.9%
GPT-4.1Sonnet 3.761.0% 39.0%
## Gemini-2.5-pro-preview-05-06 - Model Card -### Gemini-2.5-pro-preview-05-06 vs GPT-4.1 +### Comparison against GPT-4.1 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} @@ -52,7 +79,7 @@ Gemini-2.5-pro-preview-05-06 vs GPT-4.1 weaknesses: - redundant_or_duplicate: At times repeats the same point or exceeds the required brevity. -### Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 +### Comparison against Sonnet 3.7 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/sonnet_37_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} @@ -79,7 +106,7 @@ Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 weaknesses: ## GPT-4.1 - Model Card -### GPT-4.1 vs Sonnet 3.7 +### Comparison against Sonnet 3.7 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.png){width=768} @@ -104,7 +131,7 @@ weaknesses: - Sparse feedback: when it does comment, it tends to give fewer suggestions and sometimes lacks depth or completeness. - Occasional metadata/slip-ups (wrong language tags, overly broad code spans), though less harmful than Sonnet 3.7 errors. -### GPT-4.1 vs Gemini-2.5-pro-preview-05-06 +### Comparison against Gemini-2.5-pro-preview-05-06 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} @@ -127,7 +154,7 @@ GPT-4.1 weaknesses: ## Sonnet 3.7 - Model Card -### Sonnet 3.7 vs GPT-4.1 +### Comparison against GPT-4.1 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/gpt-4.1_vs_sonnet_3.7_judge_o3.png){width=768} @@ -152,7 +179,7 @@ Model 'Sonnet 3.7' vs 'GPT-4.1' - Occasional schema or formatting mistakes (missing list value, duplicated suggestions), reducing reliability. 
-### Sonnet 3.7 vs Gemini-2.5-pro-preview-05-06 +### Comparison against Gemini-2.5-pro-preview-05-06 ![Comparison](https://codium.ai/images/qodo_merge_benchmark/sonnet_37_vs_gemini-2.5-pro-preview-05-06_judge_o3.png){width=768} From f0fa27535c3373e1668e66931a286a397e1b25f9 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 13 May 2025 09:07:46 +0300 Subject: [PATCH 14/15] docs: improve model comparison headings in benchmark documentation --- docs/docs/pr_benchmark/index.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/docs/pr_benchmark/index.md b/docs/docs/pr_benchmark/index.md index 37d2d022..7c2c7096 100644 --- a/docs/docs/pr_benchmark/index.md +++ b/docs/docs/pr_benchmark/index.md @@ -66,13 +66,13 @@ Model 'Gemini-2.5-pro-preview-05-06' is generally more useful thanks to wider an #### Detailed Analysis -Gemini-2.5-pro-preview-05-06 vs GPT-4.1 strengths: +Gemini-2.5-pro-preview-05-06 strengths: - better_bug_coverage: Detects and explains more critical issues, winning in ~70 % of comparisons and achieving a higher average score. - actionable_fixes: Supplies clear code snippets, correct language labels, and often multiple coherent suggestions per diff. - deeper_reasoning: Shows stronger grasp of logic, edge cases, and cross-file implications, leading to broader, high-impact reviews. -Gemini-2.5-pro-preview-05-06 vs GPT-4.1 weaknesses: +Gemini-2.5-pro-preview-05-06 weaknesses: - guideline_violations: More prone to over-eager advice—non-critical tweaks, touching unchanged code, suggesting new imports, or minor format errors. - occasional_overreach: Some fixes are speculative or risky, potentially introducing new bugs. 
@@ -92,13 +92,13 @@ See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/ #### Detailed Analysis -Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 strengths: +Gemini-2.5-pro-preview-05-06 strengths: - higher_accuracy_and_coverage: finds real critical bugs and supplies actionable patches in most examples (better in 78 % of cases). - guideline_awareness: usually respects new-lines-only scope, ≤3 suggestions, proper YAML, and stays silent when no issues exist. - detailed_reasoning_and_patches: explanations tie directly to the diff and fixes are concrete, often catching multiple related defects that 'Sonnet 3.7' overlooks. -Gemini-2.5-pro-preview-05-06 vs Sonnet 3.7 weaknesses: +Gemini-2.5-pro-preview-05-06 weaknesses: - occasional_rule_violations: sometimes proposes new imports, package-version changes, or edits outside the added lines. - overzealous_suggestions: may add speculative or stylistic fixes that exceed the “critical” scope, or mis-label severity. @@ -120,13 +120,12 @@ See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/ #### Detailed Analysis -Model 'GPT-4.1' vs 'Sonnet 3.7' -strengths: +GPT-4.1 strengths: - Strong guideline adherence: usually stays strictly on `+` lines, avoids non-critical or stylistic advice, and rarely suggests forbidden imports; often outputs an empty list when no real bug exists. - Lower false-positive rate: suggestions are more accurate and seldom introduce new bugs; fixes compile more reliably. - Good schema discipline: YAML is almost always well-formed and fields are populated correctly. -weaknesses: +GPT-4.1 weaknesses: - Misses bugs: often returns an empty list even when a clear critical issue is present, so coverage is narrower. - Sparse feedback: when it does comment, it tends to give fewer suggestions and sometimes lacks depth or completeness. - Occasional metadata/slip-ups (wrong language tags, overly broad code spans), though less harmful than Sonnet 3.7 errors. 
@@ -167,7 +166,6 @@ See raw results [here](https://github.com/Codium-ai/pr-agent-settings/blob/main/ #### Detailed Analysis -Model 'Sonnet 3.7' vs 'GPT-4.1' 'Sonnet 3.7' strengths: - Better bug discovery breadth: more willing to dive into logic and spot critical problems that 'GPT-4.1' overlooks; often supplies multiple, detailed fixes. - Richer explanations & patches: gives fuller context and, when correct, proposes more functional or user-friendly solutions. From 87f4783fa0677d15ddacc13b3ad3a42c9811a2cb Mon Sep 17 00:00:00 2001 From: Tal Date: Tue, 13 May 2025 09:11:06 +0300 Subject: [PATCH 15/15] Update docs/mkdocs.yml Co-authored-by: qodo-merge-pro-for-open-source[bot] <189517486+qodo-merge-pro-for-open-source[bot]@users.noreply.github.com> --- docs/mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e8dd4390..8525fdee 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -60,7 +60,7 @@ nav: - Data Privacy: 'chrome-extension/data_privacy.md' - Options: 'chrome-extension/options.md' - PR Benchmark: - - FAQ: 'pr_benchmark/index.md' + - PR Benchmark: 'pr_benchmark/index.md' - Recent Updates: - Recent Updates: 'recent_updates/index.md' - AI Docs Search: 'ai_search/index.md'