feat: add prompt example duplication option for improved model output

2025-07-21 04:50:39 +08:00 · 2025-01-02 12:25:42 +02:00
parent 5971a06d73
commit 5318047202
5 changed files with 204 additions and 9 deletions
--- a/docs/docs/usage-guide/changing_a_model.md
+++ b/docs/docs/usage-guide/changing_a_model.md
@ -32,20 +32,26 @@ fallback_models=["..."]

 ### Ollama

-**Local**
-You can run Hugging Face models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama)
+You can run models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama)

-E.g. to use a new Hugging Face model locally via Ollama, set:
+E.g. to use a new model locally via Ollama, set in `.secrets.toml` or in a configuration file:
 ```
-[config] # in configuration.toml
-model = "ollama/llama2"
-fallback_models=["ollama/llama2"]
-custom_model_max_tokens=... # set the maximal input tokens for the model
+[config]
+model = "ollama/qwen2.5-coder:32b"
+fallback_models=["ollama/qwen2.5-coder:32b"]
+custom_model_max_tokens=128000 # set the maximal input tokens for the model
+duplicate_prompt_examples=true # duplicates the examples in the prompt, to help the model generate structured output

-[ollama] # in .secrets.toml
+[ollama]
 api_base = "http://localhost:11434" # or whatever port you're running Ollama on
 ```

+!!! note "Local models vs commercial models"
+    Qodo Merge is compatible with almost any AI model, but analyzing complex code repositories and pull requests requires a model specifically optimized for code analysis.
+    Commercial models such as GPT-4, Claude Sonnet, and Gemini have demonstrated robust capabilities in generating structured output for code analysis. In contrast, most open-source models currently available (as of January 2025) face challenges with these complex tasks.
+    Based on our testing, local open-source models are suitable for experimentation and learning purposes, but they may not be suitable for production-level code analysis tasks.
+    Hence, for production workflows and real-world code analysis, we recommend using commercial models.
+
 ### Hugging Face Inference Endpoints

 To use a new model with Hugging Face Inference Endpoints, for example, set:
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -34,6 +34,7 @@ ai_disclaimer_title=""  # Pro feature, title for a collapsible disclaimer to AI
 ai_disclaimer=""  # Pro feature, full text for the AI disclaimer
 output_relevant_configurations=false
 large_patch_policy = "clip" # "clip", "skip"
+duplicate_prompt_examples = false
 # seed
 seed=-1 # set positive value to fix the seed (and ensure temperature=0)
 temperature=0.2
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@ -130,6 +130,35 @@ The PR Git Diff:

 Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines.

+{%- if duplicate_prompt_examples %}
+
+
+Example output:
+```yaml
+type:
+- Bug fix
+- Refactoring
+- ...
+description: |
+  ...
+title: |
+  ...
+{%- if enable_semantic_files_types %}
+pr_files:
+- filename: |
+    ...
+{%- if include_file_summary_changes %}
+  changes_summary: |
+    ...
+{%- endif %}
+  changes_title: |
+    ...
+  label: |
+    label_key_1
+...
+{%- endif %}
+```
+{%- endif %}

 Response (should be a valid YAML, and nothing else):
 ```yaml
--- a/pr_agent/settings/pr_description_prompts_json.toml
+++ b/pr_agent/settings/pr_description_prompts_json.toml
@ -0,0 +1,158 @@
+[pr_description_prompt]
+system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR).
+Your task is to provide a full description for the PR content - type, description, title and files walkthrough.
+- Focus on the new PR code (lines starting with '+' in the 'PR Git Diff' section).
+- Keep in mind that the 'Previous title', 'Previous description' and 'Commit messages' sections may be partial, simplistic, non-informative or out of date. Hence, compare them to the PR diff code, and use them only as a reference.
+- The generated title and description should prioritize the most significant changes.
+- When quoting variables, names or file paths from the code, use backticks (`) instead of single quote (').
+
+{%- if extra_instructions %}
+
+Extra instructions from the user:
+=====
+{{extra_instructions}}
+=====
+{% endif %}
+
+
+The output must be a JSON object equivalent to type $PRDescription, according to the following Pydantic definitions:
+=====
+class PRType(str, Enum):
+    bug_fix = "Bug fix"
+    tests = "Tests"
+    enhancement = "Enhancement"
+    documentation = "Documentation"
+    other = "Other"
+
+{%- if enable_custom_labels %}
+
+{{ custom_labels_class }}
+
+{%- endif %}
+
+{%- if enable_semantic_files_types %}
+
+class FileDescription(BaseModel):
+    filename: str = Field(description="The full file path of the relevant file")
+{%- if include_file_summary_changes %}
+    changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
+{%- endif %}
+    changes_title: str = Field(description="one-line summary (5-10 words) capturing the main theme of changes in the file")
+    label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
+{%- endif %}
+
+class PRDescription(BaseModel):
+    type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
+    description: str = Field(description="summarize the PR changes in up to four bullet points, each up to 8 words. For large PRs, add sub-bullets if needed. Order bullets by importance, with each bullet highlighting a key change group.")
+    title: str = Field(description="a concise and descriptive title that captures the PR's main theme")
+{%- if enable_semantic_files_types %}
+    pr_files: List[FileDescription] = Field(max_items=20, description="a list of all the files that were changed in the PR, and summary of their changes. Each file must be analyzed regardless of change size.")
+{%- endif %}
+=====
+
+
+Example output:
+
+```json
+{
+  "type": [
+    "...",
+    "..."
+  ],
+  "description": "...",
+  "title": "..."
+{%- if enable_semantic_files_types %},
+  "pr_files": [
+    {
+      "filename": "...",
+{%- if include_file_summary_changes %}
+      "changes_summary": "...",
+{%- endif %}
+      "changes_title": "...",
+      "label": "label_key_1"
+    }
+  ]
+{%- endif %}
+}
+```
+
+
+Answer should be a valid JSON, and nothing else.
+"""
+
+user="""
+{%- if related_tickets %}
+Related Ticket Info:
+{% for ticket in related_tickets %}
+=====
+Ticket Title: '{{ ticket.title }}'
+{%- if ticket.labels %}
+Ticket Labels: {{ ticket.labels }}
+{%- endif %}
+{%- if ticket.body %}
+Ticket Description:
+#####
+{{ ticket.body }}
+#####
+{%- endif %}
+=====
+{% endfor %}
+{%- endif %}
+
+PR Info:
+
+Previous title: '{{title}}'
+
+{%- if description %}
+
+Previous description:
+=====
+{{ description|trim }}
+=====
+{%- endif %}
+
+Branch: '{{branch}}'
+
+{%- if commit_messages_str %}
+
+Commit messages:
+=====
+{{ commit_messages_str|trim }}
+=====
+{%- endif %}
+
+
+The PR Git Diff:
+=====
+{{ diff|trim }}
+=====
+
+Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines.
+
+
+Example output:
+```json
+{
+  "type": [
+    "...",
+    "..."
+  ],
+  "description": "...",
+  "title": "..."
+{%- if enable_semantic_files_types %},
+  "pr_files": [
+    {
+      "filename": "...",
+{%- if include_file_summary_changes %}
+      "changes_summary": "...",
+{%- endif %}
+      "changes_title": "...",
+      "label": "label_key_1"
+    }
+  ]
+{%- endif %}
+}
+```
+
+
+Response (should be a valid JSON, and nothing else):
+```json
+"""
--- a/pr_agent/tools/pr_description.py
+++ b/pr_agent/tools/pr_description.py
@ -71,7 +71,8 @@ class PRDescription:
            "custom_labels_class": "",  # will be filled if necessary in 'set_custom_labels' function
            "enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types,
            "related_tickets": "",
-            "include_file_summary_changes": len(self.git_provider.get_diff_files()) <= self.COLLAPSIBLE_FILE_LIST_THRESHOLD
+            "include_file_summary_changes": len(self.git_provider.get_diff_files()) <= self.COLLAPSIBLE_FILE_LIST_THRESHOLD,
+            'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False),
        }

        self.user_description = self.git_provider.get_user_description()