Mirror of https://github.com/qodo-ai/pr-agent.git
Dockerfile
@@ -1,7 +1,6 @@
 FROM python:3.12.3 AS base

 WORKDIR /app
-ADD docs/chroma_db.zip /app/docs/chroma_db.zip
 ADD pyproject.toml .
 ADD requirements.txt .
 RUN pip install . && rm pyproject.toml requirements.txt
docs/chroma_db.zip: Binary file not shown.
pr_agent/settings/pr_help_prompts.toml
@@ -1,20 +1,24 @@
 [pr_help_prompts]
 system="""You are Doc-helper, a language model designed to answer questions about a documentation website for an open-source project called "PR-Agent" (recently renamed to "Qodo Merge").
-You will recieve a question, and a list of snippets that were collected for a documentation site using RAG as the retrieval method.
-Your goal is to provide the best answer to the question using the snippets provided.
+You will receive a question, and the full documentation website content.
+Your goal is to provide the best answer to the question using the documentation provided.

 Additional instructions:
-- Try to be short and concise in your answers. Give examples if needed.
-- It is possible some of the snippets may not be relevant to the question. In that case, you should ignore them and focus on the ones that are relevant.
+- Try to be short and concise in your answers. Try to give examples if needed.
 - The main tools of PR-Agent are 'describe', 'review', 'improve'. If there is ambiguity to which tool the user is referring to, prioritize snippets of these tools over others.
+- If the question has ambiguity and can relate to different tools or platforms, provide the best answer possible based on what is available, but also state in your answer what additional information would be needed to give a more accurate answer.


 The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions:
 =====
+class relevant_section(BaseModel):
+    file_name: str = Field(description="The name of the relevant file")
+    relevant_section_header_string: str = Field(description="From the relevant file, exact text of the relevant section heading. If no markdown heading is relevant, return empty string")
+
 class DocHelper(BaseModel):
     user_question: str = Field(description="The user's question")
     response: str = Field(description="The response to the user's question")
-    relevant_snippets: List[int] = Field(description="One-based index of the relevant snippets in the list of snippets provided. Order the by relevance, with the most relevant first. If a snippet was not relevant, do not include it in the list.")
+    relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown sections in the documentation that answer the user's question, ordered by importance (most relevant first)")
 =====

@@ -24,10 +28,11 @@ user_question: |
 ...
 response: |
 ...
-relevant_snippets:
-- 2
-- 1
-- 4
+relevant_sections:
+- file_name: "src/file1.py"
+  relevant_section_header_string: |
+    ...
+- ...
 """

 user="""\
@@ -37,7 +42,7 @@ User's Question:
 =====


-Relevant doc snippets retrieved:
+Documentation website content:
 =====
 {{ snippets|trim }}
 =====
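As a sanity check on the new output schema, the following minimal sketch (not part of this change) validates a sample model reply against the $DocHelper definitions above. It assumes pydantic and PyYAML are installed; the file name and heading values are illustrative only.

# Sketch: validate a sample reply against the new $DocHelper schema.
# Assumes pydantic and PyYAML; sample values below are illustrative only.
from typing import List

import yaml
from pydantic import BaseModel, Field


class relevant_section(BaseModel):
    file_name: str = Field(description="The name of the relevant file")
    relevant_section_header_string: str = Field(description="Exact text of the relevant section heading, or an empty string")


class DocHelper(BaseModel):
    user_question: str = Field(description="The user's question")
    response: str = Field(description="The response to the user's question")
    relevant_sections: List[relevant_section] = Field(description="Relevant markdown sections, most relevant first")


sample_reply = """\
user_question: |
  How do I run the improve tool?
response: |
  Comment `/improve` on the PR.
relevant_sections:
- file_name: "tools/improve.md"
  relevant_section_header_string: |
    ## Usage
"""

answer = DocHelper(**yaml.safe_load(sample_reply))
print(answer.relevant_sections[0].file_name)  # -> tools/improve.md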
@ -1,19 +1,17 @@
|
|||||||
import os
|
|
||||||
import traceback
|
|
||||||
import zipfile
|
|
||||||
import tempfile
|
|
||||||
import copy
|
import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from jinja2 import Environment, StrictUndefined
|
from jinja2 import Environment, StrictUndefined
|
||||||
|
|
||||||
|
from pr_agent.algo import MAX_TOKENS
|
||||||
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
|
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
|
||||||
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
|
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
|
||||||
from pr_agent.algo.pr_processing import retry_with_fallback_models
|
from pr_agent.algo.pr_processing import retry_with_fallback_models
|
||||||
from pr_agent.algo.token_handler import TokenHandler
|
from pr_agent.algo.token_handler import TokenHandler
|
||||||
from pr_agent.algo.utils import ModelType, load_yaml
|
from pr_agent.algo.utils import ModelType, load_yaml, clip_tokens
|
||||||
from pr_agent.config_loader import get_settings
|
from pr_agent.config_loader import get_settings
|
||||||
from pr_agent.git_providers import get_git_provider, GithubProvider, BitbucketServerProvider, \
|
from pr_agent.git_providers import GithubProvider, BitbucketServerProvider, \
|
||||||
get_git_provider_with_context
|
get_git_provider_with_context
|
||||||
from pr_agent.log import get_logger
|
from pr_agent.log import get_logger
|
||||||
|
|
||||||
@@ -67,83 +65,6 @@ class PRHelpMessage:
             question_str = ""
         return question_str

-    def get_sim_results_from_s3_db(self, embeddings):
-        get_logger().info("Loading the S3 index...")
-        sim_results = []
-        try:
-            from langchain_chroma import Chroma
-            from urllib import request
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Define the local file path within the temporary directory
-                local_file_path = os.path.join(temp_dir, 'chroma_db.zip')
-
-                bucket = 'pr-agent'
-                file_name = 'chroma_db.zip'
-                s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}'
-                request.urlretrieve(s3_url, local_file_path)
-
-                # # Download the file from S3 to the temporary directory
-                # s3 = boto3.client('s3')
-                # s3.download_file(bucket, file_name, local_file_path)
-
-                # Extract the contents of the zip file
-                with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
-                    zip_ref.extractall(temp_dir)
-
-                vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db",
-                                     embedding_function=embeddings)
-                sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from S3: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
-    def get_sim_results_from_local_db(self, embeddings):
-        get_logger().info("Loading the local index...")
-        sim_results = []
-        try:
-            from langchain_chroma import Chroma
-            get_logger().info("Loading the Chroma index...")
-            db_path = "./docs/chroma_db.zip"
-            if not os.path.exists(db_path):
-                db_path= "/app/docs/chroma_db.zip"
-                if not os.path.exists(db_path):
-                    get_logger().error("Local db not found")
-                    return sim_results
-            with tempfile.TemporaryDirectory() as temp_dir:
-
-                # Extract the ZIP file
-                with zipfile.ZipFile(db_path, 'r') as zip_ref:
-                    zip_ref.extractall(temp_dir)
-
-                vectorstore = Chroma(persist_directory=temp_dir + "/chroma_db",
-                                     embedding_function=embeddings)
-
-                # Do similarity search
-                sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from local db: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
-    def get_sim_results_from_pinecone_db(self, embeddings):
-        get_logger().info("Loading the Pinecone index...")
-        sim_results = []
-        try:
-            from langchain_pinecone import PineconeVectorStore
-            INDEX_NAME = "pr-agent-docs"
-            vectorstore = PineconeVectorStore(
-                index_name=INDEX_NAME, embedding=embeddings,
-                pinecone_api_key=get_settings().pinecone.api_key
-            )
-
-            # Do similarity search
-            sim_results = vectorstore.similarity_search_with_score(self.question_str, k=self.num_retrieved_snippets)
-        except Exception as e:
-            get_logger().error(f"Error while getting sim from Pinecone db: {e}",
-                               artifact={"traceback": traceback.format_exc()})
-        return sim_results
-
     async def run(self):
         try:
             if self.question_str:
@@ -157,38 +78,49 @@ class PRHelpMessage:
                     get_logger().error("The `Help` tool chat feature requires an OpenAI API key for calculating embeddings")
                     return

-                # Initialize embeddings
-                from langchain_openai import OpenAIEmbeddings
-                embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
-                                              api_key=get_settings().openai.key)
+                # current path
+                docs_path = Path(__file__).parent.parent.parent / 'docs' / 'docs'
+                # get all the 'md' files inside docs_path and its subdirectories
+                md_files = list(docs_path.glob('**/*.md'))
+                folders_to_exclude = ['/finetuning_benchmark/']
+                files_to_exclude = {'EXAMPLE_BEST_PRACTICE.md', 'compression_strategy.md', '/docs/overview/index.md'}
+                md_files = [file for file in md_files if not any(folder in str(file) for folder in folders_to_exclude) and not any(file.name == file_to_exclude for file_to_exclude in files_to_exclude)]

-                # Get similar snippets via similarity search
-                if get_settings().pr_help.force_local_db:
-                    sim_results = self.get_sim_results_from_local_db(embeddings)
-                elif get_settings().get('pinecone.api_key'):
-                    sim_results = self.get_sim_results_from_pinecone_db(embeddings)
-                else:
-                    sim_results = self.get_sim_results_from_s3_db(embeddings)
-                    if not sim_results:
-                        get_logger().info("Failed to load the S3 index. Loading the local index...")
-                        sim_results = self.get_sim_results_from_local_db(embeddings)
-                if not sim_results:
-                    get_logger().error("Failed to retrieve similar snippets. Exiting...")
-                    return
+                # sort the 'md_files' so that 'priority_files' will be at the top
+                priority_files_strings = ['/docs/index.md', '/usage-guide', 'tools/describe.md', 'tools/review.md',
+                                          'tools/improve.md', '/faq']
+                md_files_priority = [file for file in md_files if
+                                     any(priority_string in str(file) for priority_string in priority_files_strings)]
+                md_files_not_priority = [file for file in md_files if file not in md_files_priority]
+                md_files = md_files_priority + md_files_not_priority

-                # Prepare relevant snippets
-                relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str =\
-                    await self.prepare_relevant_snippets(sim_results)
-                self.vars['snippets'] = relevant_snippets_str.strip()
+                docs_prompt = ""
+                for file in md_files:
+                    try:
+                        with open(file, 'r') as f:
+                            file_path = str(file).replace(str(docs_path), '')
+                            docs_prompt += f"\n==file name==\n\n{file_path}\n\n==file content==\n\n{f.read().strip()}\n=========\n\n"
+                    except Exception as e:
+                        get_logger().error(f"Error while reading the file {file}: {e}")
+                token_count = self.token_handler.count_tokens(docs_prompt)
+                get_logger().debug(f"Token count of full documentation website: {token_count}")
+
+                model = get_settings().config.model
+                max_tokens_full = MAX_TOKENS[model]  # note - here we take the actual max tokens, without any reductions. we do aim to get the full documentation website in the prompt
+                delta_output = 2000
+                if token_count > max_tokens_full - delta_output:
+                    get_logger().info(f"Token count {token_count} exceeds the limit {max_tokens_full - delta_output}. Clipping the documentation content.")
+                    docs_prompt = clip_tokens(docs_prompt, max_tokens_full - delta_output)
+                self.vars['snippets'] = docs_prompt.strip()

                 # run the AI model
                 response = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR)
                 response_yaml = load_yaml(response)
                 response_str = response_yaml.get('response')
-                relevant_snippets_numbers = response_yaml.get('relevant_snippets')
+                relevant_sections = response_yaml.get('relevant_sections')

-                if not relevant_snippets_numbers:
-                    get_logger().info(f"Could not find relevant snippets for the question: {self.question_str}")
+                if not relevant_sections:
+                    get_logger().info(f"Could not find relevant answer for the question: {self.question_str}")
                     if get_settings().config.publish_output:
                         answer_str = f"### Question: \n{self.question_str}\n\n"
                         answer_str += f"### Answer:\n\n"
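The new flow above aims to fit the entire documentation site into the prompt, so the only guard is arithmetic on the model's context window: reserve delta_output tokens for the completion and clip the prompt to what remains. A small illustrative sketch (not part of this change; the MAX_TOKENS entry and token counts are hypothetical stand-ins):

# Sketch of the clipping guard: keep the prompt within the model's context
# window, reserving delta_output tokens for the completion.
MAX_TOKENS = {"gpt-4o": 128000}  # hypothetical stand-in for pr_agent.algo.MAX_TOKENS

def prompt_budget(model: str, token_count: int, delta_output: int = 2000):
    """Return the token budget to clip to, or None if the prompt already fits."""
    limit = MAX_TOKENS[model] - delta_output
    return limit if token_count > limit else None

print(prompt_budget("gpt-4o", 130_000))  # 126000 -> docs_prompt gets clipped
print(prompt_budget("gpt-4o", 50_000))   # None -> the full documentation fits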
@@ -202,16 +134,15 @@ class PRHelpMessage:
                     answer_str += f"### Question: \n{self.question_str}\n\n"
                     answer_str += f"### Answer:\n{response_str.strip()}\n\n"
                     answer_str += f"#### Relevant Sources:\n\n"
-                    paged_published = []
-                    for page in relevant_snippets_numbers:
-                        page = int(page - 1)
-                        if page < len(relevant_pages_full) and page >= 0:
-                            if relevant_pages_full[page] in paged_published:
-                                continue
-                            link = f"{relevant_pages_full[page]}{relevant_snippets_full_header[page]}"
-                            # answer_str += f"> - [{relevant_pages_full[page]}]({link})\n"
-                            answer_str += f"> - {link}\n"
-                            paged_published.append(relevant_pages_full[page])
+                    base_path = "https://qodo-merge-docs.qodo.ai/"
+                    for section in relevant_sections:
+                        file = section.get('file_name').strip().removesuffix('.md')
+                        if str(section['relevant_section_header_string']).strip():
+                            markdown_header = section['relevant_section_header_string'].strip().strip('#').strip().lower().replace(' ', '-')
+                            answer_str += f"> - {base_path}{file}#{markdown_header}\n"
+                        else:
+                            answer_str += f"> - {base_path}{file}\n"
+

             # publish the answer
             if get_settings().config.publish_output:
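The link-building above derives a URL fragment from the markdown heading the model returns. A standalone sketch of the same transformation (not part of this change; the section values are illustrative):

# Sketch of the heading-to-anchor conversion used in the loop above.
base_path = "https://qodo-merge-docs.qodo.ai/"
section = {"file_name": "tools/improve.md",
           "relevant_section_header_string": "## Usage Tips"}  # illustrative values

file = section["file_name"].strip().removesuffix('.md')
header = section["relevant_section_header_string"]
markdown_header = header.strip().strip('#').strip().lower().replace(' ', '-')
print(f"{base_path}{file}#{markdown_header}")
# -> https://qodo-merge-docs.qodo.ai/tools/improve#usage-tips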
requirements.txt
@@ -28,12 +28,6 @@ gunicorn==22.0.0
 pytest-cov==5.0.0
 pydantic==2.8.2
 html2text==2024.2.26
-# help bot
-langchain==0.3.0
-langchain-openai==0.2.0
-langchain-pinecone==0.2.0
-langchain-chroma==0.1.4
-chromadb==0.5.7
 # Uncomment the following lines to enable the 'similar issue' tool
 # pinecone-client
 # pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main