Mirror of https://github.com/qodo-ai/pr-agent.git
Dockerfile
@@ -1,7 +1,6 @@
 FROM python:3.12.3 AS base

 WORKDIR /app
-ADD docs/chroma_db.zip /app/docs/chroma_db.zip
 ADD pyproject.toml .
 ADD requirements.txt .
 RUN pip install . && rm pyproject.toml requirements.txt
docs/chroma_db.zip: Binary file not shown.
pr_agent/settings/pr_help_prompts.toml
@@ -1,20 +1,24 @@
 [pr_help_prompts]
 system="""You are Doc-helper, a language model designed to answer questions about a documentation website for an open-source project called "PR-Agent" (recently renamed to "Qodo Merge").
-You will recieve a question, and a list of snippets that were collected for a documentation site using RAG as the retrieval method.
-Your goal is to provide the best answer to the question using the snippets provided.
+You will receive a question, and the full documentation website content.
+Your goal is to provide the best answer to the question using the documentation provided.

 Additional instructions:
-- Try to be short and concise in your answers. Give examples if needed.
-- It is possible some of the snippets may not be relevant to the question. In that case, you should ignore them and focus on the ones that are relevant.
+- Try to be short and concise in your answers. Try to give examples if needed.
 - The main tools of PR-Agent are 'describe', 'review', 'improve'. If there is ambiguity to which tool the user is referring to, prioritize snippets of these tools over others.
+- If the question has ambiguity and can relate to different tools or platforms, provide the best answer possible based on what is available, but also state in your answer what additional information would be needed to give a more accurate answer.


 The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions:
 =====
+class relevant_section(BaseModel):
+    file_name: str = Field(description="The name of the relevant file")
+    relevant_section_header_string: str = Field(description="From the relevant file, exact text of the relevant section heading. If no markdown heading is relevant, return empty string")
+
 class DocHelper(BaseModel):
     user_question: str = Field(description="The user's question")
     response: str = Field(description="The response to the user's question")
-    relevant_snippets: List[int] = Field(description="One-based index of the relevant snippets in the list of snippets provided. Order the by relevance, with the most relevant first. If a snippet was not relevant, do not include it in the list.")
+    relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown sections in the documentation that answer the user's question, ordered by importance (most relevant first)")
 =====

@@ -24,10 +28,11 @@ user_question: |
 ...
 response: |
 ...
-relevant_snippets:
-- 2
-- 1
-- 4
+relevant_sections:
+- file_name: "src/file1.py"
+  relevant_section_header_string: |
+    ...
+- ...
 """

 user="""\
@@ -37,7 +42,7 @@ User's Question:
 =====


-Relevant doc snippets retrieved:
+Documentation website content:
 =====
 {{ snippets|trim }}
 =====
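As a sanity check on the new output schema, the following minimal sketch (not part of this change) validates a sample model reply against the $DocHelper definitions above. It assumes pydantic and PyYAML are installed; the file name and heading values are illustrative only.

# Sketch: validate a sample reply against the new $DocHelper schema.
# Assumes pydantic and PyYAML; sample values below are illustrative only.
from typing import List

import yaml
from pydantic import BaseModel, Field


class relevant_section(BaseModel):
    file_name: str = Field(description="The name of the relevant file")
    relevant_section_header_string: str = Field(description="Exact text of the relevant section heading, or an empty string")


class DocHelper(BaseModel):
    user_question: str = Field(description="The user's question")
    response: str = Field(description="The response to the user's question")
    relevant_sections: List[relevant_section] = Field(description="Relevant markdown sections, most relevant first")


sample_reply = """\
user_question: |
  How do I run the improve tool?
response: |
  Comment `/improve` on the PR.
relevant_sections:
- file_name: "tools/improve.md"
  relevant_section_header_string: |
    ## Usage
"""

answer = DocHelper(**yaml.safe_load(sample_reply))
print(answer.relevant_sections[0].file_name)  # -> tools/improve.md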
@ -1,19 +1,17 @@
|
|||||||
import os
|
|
||||||
import traceback
|
|
||||||
import zipfile
|
|
||||||
import tempfile
|
|
||||||
import copy
|
import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from jinja2 import Environment, StrictUndefined
|
from jinja2 import Environment, StrictUndefined
|
||||||
|
|
||||||
|
from pr_agent.algo import MAX_TOKENS
|
||||||
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
|
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
|
||||||
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
|
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
|
||||||
from pr_agent.algo.pr_processing import retry_with_fallback_models
|
from pr_agent.algo.pr_processing import retry_with_fallback_models
|
||||||
from pr_agent.algo.token_handler import TokenHandler
|
from pr_agent.algo.token_handler import TokenHandler
|
||||||
from pr_agent.algo.utils import ModelType, load_yaml
|
from pr_agent.algo.utils import ModelType, load_yaml, clip_tokens
|
||||||
from pr_agent.config_loader import get_settings
|
from pr_agent.config_loader import get_settings
|
||||||
from pr_agent.git_providers import get_git_provider, GithubProvider, BitbucketServerProvider, \
|
from pr_agent.git_providers import GithubProvider, BitbucketServerProvider, \
|
||||||
get_git_provider_with_context
|
get_git_provider_with_context
|
||||||
from pr_agent.log import get_logger
|
from pr_agent.log import get_logger
|
||||||
|
|
||||||
@@ -67,83 +65,6 @@ class PRHelpMessage:
             question_str = ""
         return question_str

-    def get_sim_results_from_s3_db(self, embeddings):
-        get_logger().info("Loading the S3 index...")
-        sim_results = []
-        try:
-            from langchain_chroma import Chroma
-            from urllib import request
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Define the local file path within the temporary directory
-                local_file_path = os.path.join(temp_dir, 'chroma_db.zip')
-
-                bucket = 'pr-agent'
-                file_name = 'chroma_db.zip'
-                s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}'
-                request.urlretrieve(s3_url, local_file_path)
-
-                # # Download the file from S3 to the temporary directory
-                # s3 = boto3.client('s3')
-                # s3.download_file(bucket, file_name, local_file_path)
-
-                # Extract the contents of the zip file
-                with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
-                    zip_ref.extractall(temp_dir)
-
-                vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db",
-                                     embedding_function=embeddings)
-                sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from S3: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
-    def get_sim_results_from_local_db(self, embeddings):
-        get_logger().info("Loading the local index...")
-        sim_results = []
-        try:
-            from langchain_chroma import Chroma
-            get_logger().info("Loading the Chroma index...")
-            db_path = "./docs/chroma_db.zip"
-            if not os.path.exists(db_path):
-                db_path= "/app/docs/chroma_db.zip"
-                if not os.path.exists(db_path):
-                    get_logger().error("Local db not found")
-                    return sim_results
-            with tempfile.TemporaryDirectory() as temp_dir:
-
-                # Extract the ZIP file
-                with zipfile.ZipFile(db_path, 'r') as zip_ref:
-                    zip_ref.extractall(temp_dir)
-
-                vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db",
-                                     embedding_function=embeddings)
-
-                # Do similarity search
-                sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from local db: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
-    def get_sim_results_from_pinecone_db(self, embeddings):
-        get_logger().info("Loading the Pinecone index...")
-        sim_results = []
-        try:
-            from langchain_pinecone import PineconeVectorStore
-            INDEX_NAME = "pr-agent-docs"
-            vectorstore = PineconeVectorStore(
-                index_name=INDEX_NAME, embedding=embeddings,
-                pinecone_api_key=get_settings().pinecone.api_key
-            )
-
-            # Do similarity search
-            sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from Pinecone db: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
     async def run(self):
         try:
             if self.question_str:
@@ -157,38 +78,49 @@ class PRHelpMessage:
                     get_logger().error("The `Help` tool chat feature requires an OpenAI API key for calculating embeddings")
                     return

-                # Initialize embeddings
-                from langchain_openai import OpenAIEmbeddings
-                embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
-                                              api_key=get_settings().openai.key)
+                # current path
+                docs_path = Path(__file__).parent.parent.parent / 'docs' / 'docs'
+                # get all the 'md' files inside docs_path and its subdirectories
+                md_files = list(docs_path.glob('**/*.md'))
+                folders_to_exclude = ['/finetuning_benchmark/']
+                files_to_exclude = {'EXAMPLE_BEST_PRACTICE.md', 'compression_strategy.md', '/docs/overview/index.md'}
+                md_files = [file for file in md_files if not any(folder in str(file) for folder in folders_to_exclude) and not any(file.name == file_to_exclude for file_to_exclude in files_to_exclude)]

-                # Get similar snippets via similarity search
-                if get_settings().pr_help.force_local_db:
-                    sim_results = self.get_sim_results_from_local_db(embeddings)
-                elif get_settings().get('pinecone.api_key'):
-                    sim_results = self.get_sim_results_from_pinecone_db(embeddings)
-                else:
-                    sim_results = self.get_sim_results_from_s3_db(embeddings)
-                    if not sim_results:
-                        get_logger().info("Failed to load the S3 index. Loading the local index...")
-                        sim_results = self.get_sim_results_from_local_db(embeddings)
-                if not sim_results:
-                    get_logger().error("Failed to retrieve similar snippets. Exiting...")
-                    return
+                # sort the 'md_files' so that 'priority_files' will be at the top
+                priority_files_strings = ['/docs/index.md', '/usage-guide', 'tools/describe.md', 'tools/review.md',
+                                          'tools/improve.md', '/faq']
+                md_files_priority = [file for file in md_files if
+                                     any(priority_string in str(file) for priority_string in priority_files_strings)]
+                md_files_not_priority = [file for file in md_files if file not in md_files_priority]
+                md_files = md_files_priority + md_files_not_priority

-                # Prepare relevant snippets
-                relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str =\
-                    await self.prepare_relevant_snippets(sim_results)
-                self.vars['snippets'] = relevant_snippets_str.strip()
+                docs_prompt = ""
+                for file in md_files:
+                    try:
+                        with open(file, 'r') as f:
+                            file_path = str(file).replace(str(docs_path), '')
+                            docs_prompt += f"\n==file name==\n\n{file_path}\n\n==file content==\n\n{f.read().strip()}\n=========\n\n"
+                    except Exception as e:
+                        get_logger().error(f"Error while reading the file {file}: {e}")
+                token_count = self.token_handler.count_tokens(docs_prompt)
+                get_logger().debug(f"Token count of full documentation website: {token_count}")
+
+                model = get_settings().config.model
+                max_tokens_full = MAX_TOKENS[model]  # note - here we take the actual max tokens, without any reductions. we do aim to get the full documentation website in the prompt
+                delta_output = 2000
+                if token_count > max_tokens_full - delta_output:
+                    get_logger().info(f"Token count {token_count} exceeds the limit {max_tokens_full - delta_output}. Clipping the documentation content.")
+                    docs_prompt = clip_tokens(docs_prompt, max_tokens_full - delta_output)
+                self.vars['snippets'] = docs_prompt.strip()

                 # run the AI model
                 response = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR)
                 response_yaml = load_yaml(response)
                 response_str = response_yaml.get('response')
-                relevant_snippets_numbers = response_yaml.get('relevant_snippets')
+                relevant_sections = response_yaml.get('relevant_sections')

-                if not relevant_snippets_numbers:
-                    get_logger().info(f"Could not find relevant snippets for the question: {self.question_str}")
+                if not relevant_sections:
+                    get_logger().info(f"Could not find relevant answer for the question: {self.question_str}")
                     if get_settings().config.publish_output:
                         answer_str = f"### Question: \n{self.question_str}\n\n"
                         answer_str += f"### Answer:\n\n"
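The new flow above aims to fit the entire documentation site into the prompt, so the only guard is arithmetic on the model's context window: reserve delta_output tokens for the completion and clip the prompt to what remains. A small illustrative sketch (not part of this change; the MAX_TOKENS entry and token counts are hypothetical stand-ins):

# Sketch of the clipping guard: keep the prompt within the model's context
# window, reserving delta_output tokens for the completion.
MAX_TOKENS = {"gpt-4o": 128000}  # hypothetical stand-in for pr_agent.algo.MAX_TOKENS

def prompt_budget(model: str, token_count: int, delta_output: int = 2000):
    """Return the token budget to clip to, or None if the prompt already fits."""
    limit = MAX_TOKENS[model] - delta_output
    return limit if token_count > limit else None

print(prompt_budget("gpt-4o", 130_000))  # 126000 -> docs_prompt gets clipped
print(prompt_budget("gpt-4o", 50_000))   # None -> the full documentation fits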
@@ -202,16 +134,15 @@ class PRHelpMessage:
                     answer_str += f"### Question: \n{self.question_str}\n\n"
                     answer_str += f"### Answer:\n{response_str.strip()}\n\n"
                     answer_str += f"#### Relevant Sources:\n\n"
-                    paged_published = []
-                    for page in relevant_snippets_numbers:
-                        page = int(page - 1)
-                        if page < len(relevant_pages_full) and page >= 0:
-                            if relevant_pages_full[page] in paged_published:
-                                continue
-                            link = f"{relevant_pages_full[page]}{relevant_snippets_full_header[page]}"
-                            # answer_str += f"> - [{relevant_pages_full[page]}]({link})\n"
-                            answer_str += f"> - {link}\n"
-                            paged_published.append(relevant_pages_full[page])
+                    base_path = "https://qodo-merge-docs.qodo.ai/"
+                    for section in relevant_sections:
+                        file = section.get('file_name').strip().removesuffix('.md')
+                        if str(section['relevant_section_header_string']).strip():
+                            markdown_header = section['relevant_section_header_string'].strip().strip('#').strip().lower().replace(' ', '-')
+                            answer_str += f"> - {base_path}{file}#{markdown_header}\n"
+                        else:
+                            answer_str += f"> - {base_path}{file}\n"
+

             # publish the answer
             if get_settings().config.publish_output:
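The link-building above derives a URL fragment from the markdown heading the model returns. A standalone sketch of the same transformation (not part of this change; the section values are illustrative):

# Sketch of the heading-to-anchor conversion used in the loop above.
base_path = "https://qodo-merge-docs.qodo.ai/"
section = {"file_name": "tools/improve.md",
           "relevant_section_header_string": "## Usage Tips"}  # illustrative values

file = section["file_name"].strip().removesuffix('.md')
header = section["relevant_section_header_string"]
markdown_header = header.strip().strip('#').strip().lower().replace(' ', '-')
print(f"{base_path}{file}#{markdown_header}")
# -> https://qodo-merge-docs.qodo.ai/tools/improve#usage-tips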
requirements.txt
@@ -28,12 +28,6 @@ gunicorn==22.0.0
 pytest-cov==5.0.0
 pydantic==2.8.2
 html2text==2024.2.26
-# help bot
-langchain==0.3.0
-langchain-openai==0.2.0
-langchain-pinecone==0.2.0
-langchain-chroma==0.1.4
-chromadb==0.5.7
 # Uncomment the following lines to enable the 'similar issue' tool
 # pinecone-client
 # pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main