feat: add prompt example duplication option for improved model output

This commit is contained in:
mrT23
2025-01-02 12:25:42 +02:00
parent 5971a06d73
commit 5318047202
5 changed files with 204 additions and 9 deletions

View File

@ -32,20 +32,26 @@ fallback_models=["..."]
### Ollama
**Local**
You can run Hugging Face models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama)
You can run models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama)
E.g. to use a new Hugging Face model locally via Ollama, set:
E.g. to use a new model locally via Ollama, set in `.secrets.toml` or in a configuration file:
```
[config] # in configuration.toml
model = "ollama/llama2"
fallback_models=["ollama/llama2"]
custom_model_max_tokens=... # set the maximal input tokens for the model
[config]
model = "ollama/qwen2.5-coder:32b"
fallback_models=["ollama/qwen2.5-coder:32b"]
custom_model_max_tokens=128000 # set the maximal input tokens for the model
duplicate_prompt_examples=true # will duplicate the examples in the prompt, to help the model generate structured output
[ollama] # in .secrets.toml
[ollama]
api_base = "http://localhost:11434" # or whatever port you're running Ollama on
```
!!! note "Local models vs commercial models"
Qodo Merge is compatible with almost any AI model, but analyzing complex code repositories and pull requests requires a model specifically optimized for code analysis.
Commercial models such as GPT-4, Claude Sonnet, and Gemini have demonstrated robust capabilities in generating structured output for code analysis. In contrast, most open-source models currently available (as of January 2025) face challenges with these complex tasks.
Based on our testing, local open-source models are suitable for experimentation and learning purposes, but they may not be suitable for production-level code analysis tasks.
Hence, for production workflows and real-world code analysis, we recommend using commercial models.
### Hugging Face Inference Endpoints
To use a new model with Hugging Face Inference Endpoints, for example, set:

View File

@ -34,6 +34,7 @@ ai_disclaimer_title="" # Pro feature, title for a collapsible disclaimer to AI
ai_disclaimer="" # Pro feature, full text for the AI disclaimer
output_relevant_configurations=false
large_patch_policy = "clip" # "clip", "skip"
duplicate_prompt_examples = false
# seed
seed=-1 # set positive value to fix the seed (and ensure temperature=0)
temperature=0.2

View File

@ -130,6 +130,35 @@ The PR Git Diff:
Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines.
{%- if duplicate_prompt_examples %}
Example output:
```yaml
type:
- Bug fix
- Refactoring
- ...
description: |
...
title: |
...
{%- if enable_semantic_files_types %}
pr_files:
- filename: |
...
{%- if include_file_summary_changes %}
changes_summary: |
...
{%- endif %}
changes_title: |
...
label: |
label_key_1
...
{%- endif %}
```
{%- endif %}
Response (should be a valid YAML, and nothing else):
```yaml

View File

@ -0,0 +1,158 @@
[pr_description_prompt]
system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR).
Your task is to provide a full description for the PR content - type, description, title and files walkthrough.
- Focus on the new PR code (lines starting with '+' in the 'PR Git Diff' section).
- Keep in mind that the 'Previous title', 'Previous description' and 'Commit messages' sections may be partial, simplistic, non-informative or out of date. Hence, compare them to the PR diff code, and use them only as a reference.
- The generated title and description should prioritize the most significant changes.
- When quoting variables, names or file paths from the code, use backticks (`) instead of single quotes (').
{%- if extra_instructions %}
Extra instructions from the user:
=====
{{extra_instructions}}
=====
{% endif %}
The output must be a JSON object equivalent to type $PRDescription, according to the following Pydantic definitions:
=====
class PRType(str, Enum):
bug_fix = "Bug fix"
tests = "Tests"
enhancement = "Enhancement"
documentation = "Documentation"
other = "Other"
{%- if enable_custom_labels %}
{{ custom_labels_class }}
{%- endif %}
{%- if enable_semantic_files_types %}
class FileDescription(BaseModel):
filename: str = Field(description="The full file path of the relevant file")
{%- if include_file_summary_changes %}
changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
{%- endif %}
changes_title: str = Field(description="one-line summary (5-10 words) capturing the main theme of changes in the file")
label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
{%- endif %}
class PRDescription(BaseModel):
type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
description: str = Field(description="summarize the PR changes in up to four bullet points, each up to 8 words. For large PRs, add sub-bullets if needed. Order bullets by importance, with each bullet highlighting a key change group.")
title: str = Field(description="a concise and descriptive title that captures the PR's main theme")
{%- if enable_semantic_files_types %}
pr_files: List[FileDescription] = Field(max_items=20, description="a list of all the files that were changed in the PR, and summary of their changes. Each file must be analyzed regardless of change size.")
{%- endif %}
=====
Example output:
```json
{
"type": [
"...",
"..."
],
"description": "...",
"title": "..."
{%- if enable_semantic_files_types %},
"pr_files": [
{
"filename": "...",
{%- if include_file_summary_changes %}
"changes_summary": "...",
{%- endif %}
"changes_title": "...",
"label": "label_key_1"
}
]
{%- endif %}
}
```
Answer should be a valid JSON, and nothing else.
"""
user="""
{%- if related_tickets %}
Related Ticket Info:
{% for ticket in related_tickets %}
=====
Ticket Title: '{{ ticket.title }}'
{%- if ticket.labels %}
Ticket Labels: {{ ticket.labels }}
{%- endif %}
{%- if ticket.body %}
Ticket Description:
#####
{{ ticket.body }}
#####
{%- endif %}
=====
{% endfor %}
{%- endif %}
PR Info:
Previous title: '{{title}}'
{%- if description %}
Previous description:
=====
{{ description|trim }}
=====
{%- endif %}
Branch: '{{branch}}'
{%- if commit_messages_str %}
Commit messages:
=====
{{ commit_messages_str|trim }}
=====
{%- endif %}
The PR Git Diff:
=====
{{ diff|trim }}
=====
Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines.
Example output:
```json
{
"type": [
"...",
"..."
],
"description": "...",
"title": "..."
{%- if enable_semantic_files_types %},
"pr_files": [
{
"filename": "...",
{%- if include_file_summary_changes %}
"changes_summary": "...",
{%- endif %}
"changes_title": "...",
"label": "label_key_1"
}
]
{%- endif %}
}
```
Response (should be a valid JSON, and nothing else):
```json
"""

View File

@ -71,7 +71,8 @@ class PRDescription:
"custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function
"enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types,
"related_tickets": "",
"include_file_summary_changes": len(self.git_provider.get_diff_files()) <= self.COLLAPSIBLE_FILE_LIST_THRESHOLD
"include_file_summary_changes": len(self.git_provider.get_diff_files()) <= self.COLLAPSIBLE_FILE_LIST_THRESHOLD,
'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False),
}
self.user_description = self.git_provider.get_user_description()