Merge remote-tracking branch 'origin/main'

# Conflicts: # pr_agent/tools/pr_description.py
2025-07-21 04:50:39 +08:00 · 2024-09-21 21:10:38 +03:00
parent 90295b6429 08d6bbc94c
commit 835684b92a
10 changed files with 399 additions and 95 deletions
--- a/README.md
+++ b/README.md
@ -43,6 +43,12 @@ CodiumAI PR-Agent aims to help efficiently review and handle pull requests, by p
  
 ## News and Updates

+### September 21, 2024
+Need help with PR-Agent? New feature - simply comment `/help "your question"` in a pull request, and PR-Agent will provide you with the [relevant documentation](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334).
+
+<kbd><img src="https://www.codium.ai/images/pr_agent/pr_help_chat.png" width="768"></kbd>
+
+
 ### September 12, 2024
 [Dynamic context](https://pr-agent-docs.codium.ai/core-abilities/dynamic_context/) is now the default option for context extension. 
 This feature enables PR-Agent to dynamically adjust the relevant context for each code hunk, while avoiding overflowing the model with too much information.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -1,6 +1,7 @@
 FROM python:3.12.3 AS base

 WORKDIR /app
+ADD docs/chroma_db.zip /app/docs/chroma_db.zip
 ADD pyproject.toml .
 ADD requirements.txt .
 RUN pip install . && rm pyproject.toml requirements.txt
--- a/docs/chroma_db.zip
+++ b/docs/chroma_db.zip
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@ -8,6 +8,7 @@ CodiumAI PR-Agent is an open-source tool to help efficiently review and handle p

 - See the [Tools Guide](./tools/index.md) for a detailed description of the different tools.

+To search the documentation site using natural language, simply comment `/help "your question"` in a pull request where PR-Agent is installed. PR-Agent will then provide you with an [answer](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334), including relevant documentation links.

 ## PR-Agent Features
 PR-Agent offers extensive pull request functionalities across various git providers.
--- a/pr_agent/config_loader.py
+++ b/pr_agent/config_loader.py
@ -27,8 +27,9 @@ global_settings = Dynaconf(
        "settings/pr_update_changelog_prompts.toml",
        "settings/pr_custom_labels.toml",
        "settings/pr_add_docs.toml",
+        "settings/custom_labels.toml",
+        "settings/pr_help_prompts.toml",
        "settings_prod/.secrets.toml",
-        "settings/custom_labels.toml"
    ]]
 )

--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -183,6 +183,7 @@ enable_help_text=true
 final_update_message = false

 [pr_help] # /help #
+force_local_db=false

 [pr_config] # /config #

--- a/pr_agent/settings/pr_help_prompts.toml
+++ b/pr_agent/settings/pr_help_prompts.toml
@ -0,0 +1,43 @@
+[pr_help_prompts]
+system="""You are Doc-helper, a language model designed to answer questions about a documentation website for an open-source project called "PR-Agent".
+You will receive a question, and a list of snippets that were collected for a documentation site using RAG as the retrieval method.
+Your goal is to provide the best answer to the question using the snippets provided.
+Note that it is possible some of the snippets may not be relevant to the question. In that case, you should ignore them and focus on the ones that are relevant.
+
+Try to be short and concise in your answers.
+
+The output must be a YAML object equivalent to type $doc_help, according to the following Pydantic definitions:
+
+class doc_help(BaseModel):
+    user_question: str = Field(description="The user's question")
+    response: str = Field(description="The response to the user's question")
+    relevant_snippets: List[int] = Field(description="One-based index of the relevant snippets in the list of snippets provided. Order them by relevance, with the most relevant first. If a snippet was not relevant, do not include it in the list.")
+
+Example output:
+```yaml
+user_question: |
+  ...
+response: |
+    ...
+relevant_snippets:
+    - 1
+    - 2
+    - 4
+"""
+
+user="""\
+User's Question:
+=====
+{{ question|trim }}
+=====
+
+
+Relevant doc snippets retrieved:
+=====
+{{ snippets|trim }}
+=====
+
+
+Response (should be a valid YAML, and nothing else):
+```yaml
+"""
--- a/pr_agent/tools/pr_description.py
+++ b/pr_agent/tools/pr_description.py
@ -117,8 +117,9 @@ class PRDescription:
                pr_body += "<hr>\n\n<details> <summary><strong>✨ Describe tool usage guide:</strong></summary><hr> \n\n"
                pr_body += HelpMessage.get_describe_usage_guide()
                pr_body += "\n</details>\n"
-            elif get_settings().pr_description.enable_help_comment:
-                pr_body += '\n\n___\n\n> 💡 **PR-Agent usage**: Comment `/help "your question"` on any pull request to receive relevant information'
+            elif self.git_provider.is_supported("gfm_markdown") and get_settings().pr_description.enable_help_comment:
+                pr_body += "\n\n___\n\n> 💡 **PR-Agent usage**:"
+                pr_body += '\n>Need PR-Agent help? Comment `/help "your question"` on any pull request to receive relevant information'

            # Output the relevant configurations if enabled
            if get_settings().get('config', {}).get('output_relevant_configurations', False):
--- a/pr_agent/tools/pr_help_message.py
+++ b/pr_agent/tools/pr_help_message.py
@ -1,15 +1,215 @@
+import os
+import traceback
+import zipfile
+import tempfile
+import copy
+from functools import partial
+
+from jinja2 import Environment, StrictUndefined
+
+from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
+from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
+from pr_agent.algo.pr_processing import retry_with_fallback_models
+from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.algo.utils import ModelType, load_yaml
 from pr_agent.config_loader import get_settings
-from pr_agent.git_providers import get_git_provider, GithubProvider
+from pr_agent.git_providers import get_git_provider, GithubProvider, BitbucketServerProvider, \
+    get_git_provider_with_context
 from pr_agent.log import get_logger


+def extract_header(snippet):
+    """Return a URL anchor (``#some-header``) for the first header of a snippet.
+
+    Only the metadata part of the snippet (everything before the
+    ``===Snippet content===`` marker) is scanned. The first line that starts
+    with ``Header `` and contains a ``': '`` separator supplies the title,
+    which is lower-cased and has its spaces replaced by dashes.
+
+    Args:
+        snippet: Raw snippet text: metadata lines, optionally followed by the
+            ``===Snippet content===`` marker and the snippet body.
+
+    Returns:
+        The anchor string including the leading ``#``, or ``''`` when the
+        snippet is empty or no usable header line is found.
+    """
+    if not snippet:
+        return ''
+    metadata = snippet.split('===Snippet content===')[0]
+    for line in metadata.split('\n'):
+        line = line.strip()
+        if not line.startswith('Header '):
+            continue
+        parts = line.split(': ')
+        if len(parts) < 2:
+            # Malformed header line without a "<level>: <title>" separator;
+            # skip it instead of raising IndexError.
+            continue
+        title = parts[1]
+        # The first well-formed header line decides the result.
+        return f"#{title.lower().replace(' ', '-')}" if title else ''
+    return ''
+
 class PRHelpMessage:
-    def __init__(self, pr_url: str, args=None, ai_handler=None):
-        self.git_provider = get_git_provider()(pr_url)
+    def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
+        """Set up the help tool for a pull request.
+
+        Args:
+            pr_url: URL of the pull request, passed to ``get_git_provider_with_context``.
+            args: Optional sequence of words forming the user's question; joined
+                by ``parse_args``.
+            ai_handler: Zero-argument callable that returns the AI handler instance.
+        """
+        self.git_provider = get_git_provider_with_context(pr_url)
+        self.ai_handler = ai_handler()
+        self.question_str = self.parse_args(args)
+        # Prompt variables and the token handler are only created when a
+        # question was supplied; without one these attributes are not set.
+        if self.question_str:
+            self.vars = {
+                "question": self.question_str,
+                "snippets": "",
+            }
+            self.token_handler = TokenHandler(None,
+                                              self.vars,
+                                              get_settings().pr_help_prompts.system,
+                                              get_settings().pr_help_prompts.user)
+
+    async def _prepare_prediction(self, model: str):
+        """Render the help prompts and ask the given model for an answer.
+
+        Args:
+            model: Name of the model passed to the AI handler.
+
+        Returns:
+            The raw model response, or ``""`` if rendering or the completion
+            call raised (the error is logged).
+        """
+        try:
+            prompt_vars = copy.deepcopy(self.vars)
+            jinja_env = Environment(undefined=StrictUndefined)
+            system_template = jinja_env.from_string(get_settings().pr_help_prompts.system)
+            system_prompt = system_template.render(prompt_vars)
+            user_template = jinja_env.from_string(get_settings().pr_help_prompts.user)
+            user_prompt = user_template.render(prompt_vars)
+            answer, _finish_reason = await self.ai_handler.chat_completion(
+                model=model,
+                temperature=get_settings().config.temperature,
+                system=system_prompt,
+                user=user_prompt,
+            )
+            return answer
+        except Exception as err:
+            get_logger().error(f"Error while preparing prediction: {err}")
+            return ""
+
+    def parse_args(self, args):
+        """Join the command arguments into a single question string.
+
+        Returns ``""`` when ``args`` is missing or empty.
+        """
+        if not args or len(args) == 0:
+            return ""
+        return " ".join(args)
+
+    def get_sim_results_from_s3_db(self, embeddings):
+        """Run a similarity search against the Chroma index downloaded from S3.
+
+        The zipped index is fetched over HTTPS into a temporary directory,
+        extracted there and queried with ``self.question_str`` (top 4 hits).
+
+        Args:
+            embeddings: Embedding function handed to the Chroma vector store.
+
+        Returns:
+            The result of ``similarity_search_with_score``, or an empty list
+            if any step raised (the error is logged with its traceback).
+        """
+        get_logger().info("Loading the S3 index...")
+        sim_results = []
+        try:
+            from langchain_chroma import Chroma
+            from urllib import request
+
+            bucket = 'pr-agent'
+            file_name = 'chroma_db.zip'
+            s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}'
+
+            with tempfile.TemporaryDirectory() as work_dir:
+                # Download the archive into the temporary directory
+                archive_path = os.path.join(work_dir, 'chroma_db.zip')
+                request.urlretrieve(s3_url, archive_path)
+
+                # Unpack it next to the downloaded file
+                with zipfile.ZipFile(archive_path, 'r') as archive:
+                    archive.extractall(work_dir)
+
+                # Query while the temporary directory still exists
+                store = Chroma(persist_directory=work_dir + "/chroma_db",
+                               embedding_function=embeddings)
+                sim_results = store.similarity_search_with_score(self.question_str, k=4)
+        except Exception as err:
+            get_logger().error(f"Error while getting sim from S3: {err}",
+                               artifact={"traceback": traceback.format_exc()})
+        return sim_results
+
+    def get_sim_results_from_local_db(self, embeddings):
+        """Run a similarity search against the Chroma index bundled locally.
+
+        Looks for ``chroma_db.zip`` under ``./docs`` first and ``/app/docs``
+        second, extracts it into a temporary directory and queries it with
+        ``self.question_str`` (top 4 hits).
+
+        Args:
+            embeddings: Embedding function handed to the Chroma vector store.
+
+        Returns:
+            The result of ``similarity_search_with_score``, or an empty list
+            if the archive is missing or any step raised (both are logged).
+        """
+        get_logger().info("Loading the local index...")
+        sim_results = []
+        try:
+            from langchain_chroma import Chroma
+            get_logger().info("Loading the Chroma index...")
+
+            # First existing candidate wins; later ones are not probed.
+            candidates = ("./docs/chroma_db.zip", "/app/docs/chroma_db.zip")
+            db_path = next((path for path in candidates if os.path.exists(path)), None)
+            if db_path is None:
+                get_logger().error("Local db not found")
+                return sim_results
+
+            with tempfile.TemporaryDirectory() as work_dir:
+                # Unpack the archive
+                with zipfile.ZipFile(db_path, 'r') as archive:
+                    archive.extractall(work_dir)
+
+                store = Chroma(persist_directory=work_dir + "/chroma_db",
+                               embedding_function=embeddings)
+
+                # Query while the temporary directory still exists
+                sim_results = store.similarity_search_with_score(self.question_str, k=4)
+        except Exception as err:
+            get_logger().error(f"Error while getting sim from local db: {err}",
+                               artifact={"traceback": traceback.format_exc()})
+        return sim_results
+
+    def get_sim_results_from_pinecone_db(self, embeddings):
+        """Run a similarity search against the ``pr-agent-docs`` Pinecone index.
+
+        Args:
+            embeddings: Embedding object handed to the Pinecone vector store.
+
+        Returns:
+            The result of ``similarity_search_with_score`` for
+            ``self.question_str`` (top 4 hits), or an empty list if any step
+            raised (the error is logged with its traceback).
+        """
+        get_logger().info("Loading the Pinecone index...")
+        sim_results = []
+        try:
+            from langchain_pinecone import PineconeVectorStore
+
+            index_name = "pr-agent-docs"
+            store = PineconeVectorStore(
+                index_name=index_name,
+                embedding=embeddings,
+                pinecone_api_key=get_settings().pinecone.api_key,
+            )
+            sim_results = store.similarity_search_with_score(self.question_str, k=4)
+        except Exception as err:
+            get_logger().error(f"Error while getting sim from Pinecone db: {err}",
+                               artifact={"traceback": traceback.format_exc()})
+        return sim_results

    async def run(self):
        try:
-            if not self.git_provider.is_supported("gfm_markdown"):
+            if self.question_str:
+                get_logger().info(f'Answering a PR question about the PR {self.git_provider.pr_url} ')
+
+                if not get_settings().get('openai.key'):
+                    if get_settings().config.publish_output:
+                        self.git_provider.publish_comment(
+                            "The `Help` tool chat feature requires an OpenAI API key for calculating embeddings")
+                    else:
+                        get_logger().error("The `Help` tool chat feature requires an OpenAI API key for calculating embeddings")
+                    return
+
+                # Initialize embeddings
+                from langchain_openai import OpenAIEmbeddings
+                embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",
+                                              api_key=get_settings().openai.key)
+
+                # Get similar snippets via similarity search
+                if get_settings().pr_help.force_local_db:
+                    sim_results = self.get_sim_results_from_local_db(embeddings)
+                elif get_settings().get('pinecone.api_key'):
+                    sim_results = self.get_sim_results_from_pinecone_db(embeddings)
+                else:
+                    sim_results = self.get_sim_results_from_s3_db(embeddings)
+                    if not sim_results:
+                        get_logger().info("Failed to load the S3 index. Loading the local index...")
+                        sim_results = self.get_sim_results_from_local_db(embeddings)
+                if not sim_results:
+                    get_logger().error("Failed to retrieve similar snippets. Exiting...")
+                    return
+
+                # Prepare relevant snippets
+                relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str =\
+                    await self.prepare_relevant_snippets(sim_results)
+                self.vars['snippets'] = relevant_snippets_str.strip()
+
+                # run the AI model
+                response = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR)
+                response_yaml = load_yaml(response)
+                response_str = response_yaml.get('response')
+                relevant_snippets_numbers = response_yaml.get('relevant_snippets')
+
+                # prepare the answer
+                answer_str = ""
+                if response_str:
+                    answer_str += f"### Question: \n{self.question_str}\n\n"
+                    answer_str += f"### Answer:\n{response_str.strip()}\n\n"
+                    answer_str += f"#### Relevant Sources:\n\n"
+                    paged_published = []
+                    for page in relevant_snippets_numbers:
+                        page = int(page - 1)
+                        if page < len(relevant_pages_full) and page >= 0:
+                            if relevant_pages_full[page] in paged_published:
+                                continue
+                            link = f"{relevant_pages_full[page]}{relevant_snippets_full_header[page]}"
+                            # answer_str += f"> - [{relevant_pages_full[page]}]({link})\n"
+                            answer_str += f"> - {link}\n"
+                            paged_published.append(relevant_pages_full[page])
+
+                # publish the answer
+                if get_settings().config.publish_output:
+                    self.git_provider.publish_comment(answer_str)
+                else:
+                    get_logger().info(f"Answer: {response}")
+            else:
+                if not isinstance(self.git_provider, BitbucketServerProvider) and not self.git_provider.is_supported("gfm_markdown"):
                    self.git_provider.publish_comment(
                        "The `Help` tool requires gfm markdown, which is not supported by your code platform.")
                    return
@ -93,14 +293,58 @@ class PRHelpMessage:
                    pr_comment += "</table>\n\n"
                    pr_comment += f"""\n\n(1) Note that each tool be [triggered automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage)."""
                    pr_comment += f"""\n\n(2) Tools marked with [*] require additional parameters to be passed. For example, to invoke the `/ask` tool, you need to comment on a PR: `/ask "<question content>"`. See the relevant documentation for each tool for more details."""
+                elif isinstance(self.git_provider, BitbucketServerProvider):
+                    # only support basic commands in BBDC
+                    pr_comment = generate_bbdc_table(tool_names[:4], descriptions[:4])
                else:
                    pr_comment += f"<table><tr align='left'><th align='left'>Tool</th><th align='left'>Command</th><th align='left'>Description</th></tr>"
                    for i in range(len(tool_names)):
                        pr_comment += f"\n<tr><td align='left'>\n\n<strong>{tool_names[i]}</strong></td><td>{commands[i]}</td><td>{descriptions[i]}</td></tr>"
                    pr_comment += "</table>\n\n"
                    pr_comment += f"""\n\nNote that each tool be [invoked automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage)."""
+
                if get_settings().config.publish_output:
                    self.git_provider.publish_comment(pr_comment)
        except Exception as e:
-            get_logger().error(f"Error while running PRHelpMessage: {e}")
+            get_logger().exception(f"Error while running PRHelpMessage: {e}")
        return ""
+
+    async def prepare_relevant_snippets(self, sim_results):
+        """Turn similarity-search hits into prompt text and source lookups.
+
+        Args:
+            sim_results: Sequence of ``(document, score)`` pairs; each document
+                exposes ``metadata['source']`` and ``page_content``.
+
+        Returns:
+            A tuple ``(pages, headers, snippets_str)`` where ``pages`` holds the
+            source of every hit, ``headers`` the anchor computed by
+            ``extract_header`` for every hit, and ``snippets_str`` the snippets
+            formatted for the prompt. All three follow the order of
+            ``sim_results``.
+        """
+        relevant_snippets_full = []
+        relevant_pages_full = []
+        relevant_snippets_full_header = []
+        for result in sim_results:
+            document = result[0]
+            content = document.page_content
+            relevant_snippets_full.append(content)
+            relevant_snippets_full_header.append(extract_header(content))
+            relevant_pages_full.append(document.metadata['source'])
+
+        # Label snippets starting at 1: the system prompt asks the model for
+        # one-based indices and the caller subtracts 1 before indexing the
+        # returned lists, so zero-based labels would shift every cited source.
+        relevant_snippets_str = "".join(
+            f"Snippet {number}:\n\n{snippet}\n\n-------------------\n\n"
+            for number, snippet in enumerate(relevant_snippets_full, start=1)
+        )
+        return relevant_pages_full, relevant_snippets_full_header, relevant_snippets_str
+
+
+def generate_bbdc_table(column_arr_1, column_arr_2):
+    """Render two columns as a Markdown table with ``Tool``/``Description`` headers.
+
+    The table has as many rows as the longer column; cells missing from the
+    shorter column are left empty.
+    """
+    def cell(column, idx):
+        # Pad the shorter column with empty cells.
+        return column[idx] if idx < len(column) else ""
+
+    row_count = max(len(column_arr_1), len(column_arr_2))
+    body = "".join(
+        f"| {cell(column_arr_1, idx)} | {cell(column_arr_2, idx)} |\n"
+        for idx in range(row_count)
+    )
+    # Header row, separator row, then the data rows
+    return "| Tool  | Description | \n" + "|--|--|\n" + body
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-aiohttp==3.9.4
+aiohttp==3.9.5
 anthropic[vertex]==0.21.3
 atlassian-python-api==3.41.4
 azure-devops==7.1.0b3
@ -13,7 +13,7 @@ Jinja2==3.1.2
 litellm==1.43.13
 loguru==0.7.2
 msrest==0.7.1
-openai==1.40.6
+openai==1.46.0
 pytest==7.4.0
 PyGithub==1.59.*
 PyYAML==6.0.1
@ -28,6 +28,12 @@ gunicorn==22.0.0
 pytest-cov==5.0.0
 pydantic==2.8.2
 html2text==2024.2.26
+# help bot
+langchain==0.3.0
+langchain-openai==0.2.0
+langchain-pinecone==0.2.0
+langchain-chroma==0.1.4
+chromadb==0.5.7
 # Uncomment the following lines to enable the 'similar issue' tool
 # pinecone-client
 # pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main