Merge remote-tracking branch 'origin/main' into fix_bitbucket_publish_description

2025-07-21 04:50:39 +08:00 · 2023-09-10 14:08:17 +03:00
parent 2aef9dfe55 fd63fe4c95
commit 98d0835c48
13 changed files with 382 additions and 21 deletions
--- a/.github/workflows/pr-agent-review.yaml
+++ b/.github/workflows/pr-agent-review.yaml
@ -21,7 +21,10 @@ jobs:
        id: pragent
        uses: Codium-ai/pr-agent@main
        env:
-          OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
-          OPENAI_ORG: ${{ secrets.OPENAI_ORG }} # optional
+          OPENAI.KEY: ${{ secrets.OPENAI_KEY }}
+          OPENAI.ORG: ${{ secrets.OPENAI_ORG }} # optional
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PINECONE.API_KEY: ${{ secrets.PINECONE_API_KEY }}
+          PINECONE.ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }}
+

--- a/Usage.md
+++ b/Usage.md
@ -50,12 +50,12 @@ When running from your local repo (CLI), your local configuration file will be u

 Examples for invoking the different tools via the CLI:

- **Review**:       `python cli.py --pr_url=<pr_url>  /review`
- **Describe**:     `python cli.py --pr_url=<pr_url>  /describe`
- **Improve**:      `python cli.py --pr_url=<pr_url>  /improve`
- **Ask**:          `python cli.py --pr_url=<pr_url>  /ask "Write me a poem about this PR"`
- **Reflect**:      `python cli.py --pr_url=<pr_url>  /reflect`
- **Update Changelog**:      `python cli.py --pr_url=<pr_url>  /update_changelog`
+- **Review**:       `python cli.py --pr_url=<pr_url>  review`
+- **Describe**:     `python cli.py --pr_url=<pr_url>  describe`
+- **Improve**:      `python cli.py --pr_url=<pr_url>  improve`
+- **Ask**:          `python cli.py --pr_url=<pr_url>  ask "Write me a poem about this PR"`
+- **Reflect**:      `python cli.py --pr_url=<pr_url>  reflect`
+- **Update Changelog**:      `python cli.py --pr_url=<pr_url>  update_changelog`

 `<pr_url>` is the url of the relevant PR (for example: https://github.com/Codium-ai/pr-agent/pull/50).

@ -169,6 +169,31 @@ in the configuration.toml

 #### Huggingface

+**Local**  
+You can run Huggingface models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama).
+
+E.g. to use a new Huggingface model locally via Ollama, set:
+```
+[__init__.py]
+MAX_TOKENS = {
+    "model-name-on-ollama": <max_tokens>
+}
+e.g.
+MAX_TOKENS={
+    ...,
+    "llama2": 4096
+}
+
+
+[config] # in configuration.toml
+model = "ollama/llama2"
+
+[ollama] # in .secrets.toml
+api_base = ... # the base url of your local Ollama server
+```
+
+**Inference Endpoints**
+
 To use a new model with Huggingface Inference Endpoints, for example, set:
 ```
 [__init__.py]
--- a/pr_agent/agent/pr_agent.py
+++ b/pr_agent/agent/pr_agent.py
@ -9,6 +9,7 @@ from pr_agent.git_providers import get_git_provider
 from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
 from pr_agent.tools.pr_description import PRDescription
 from pr_agent.tools.pr_information_from_user import PRInformationFromUser
+from pr_agent.tools.pr_similar_issue import PRSimilarIssue
 from pr_agent.tools.pr_questions import PRQuestions
 from pr_agent.tools.pr_reviewer import PRReviewer
 from pr_agent.tools.pr_update_changelog import PRUpdateChangelog
@ -30,6 +31,7 @@ command2class = {
    "update_changelog": PRUpdateChangelog,
    "config": PRConfig,
    "settings": PRConfig,
+    "similar_issue": PRSimilarIssue,
 }

 commands = list(command2class.keys())
--- a/pr_agent/algo/init.py
+++ b/pr_agent/algo/init.py
@ -1,4 +1,5 @@
 MAX_TOKENS = {
+    'text-embedding-ada-002': 8000,
    'gpt-3.5-turbo': 4000,
    'gpt-3.5-turbo-0613': 4000,
    'gpt-3.5-turbo-0301': 4000,
--- a/pr_agent/algo/ai_handler.py
+++ b/pr_agent/algo/ai_handler.py
@ -1,4 +1,5 @@
 import logging
+import os

 import litellm
 import openai
@ -24,6 +25,11 @@ class AiHandler:
        try:
            openai.api_key = get_settings().openai.key
            litellm.openai_key = get_settings().openai.key
+            if get_settings().get("litellm.use_client"):
+                litellm_token = get_settings().get("litellm.LITELLM_TOKEN")
+                assert litellm_token, "LITELLM_TOKEN is required"
+                os.environ["LITELLM_TOKEN"] = litellm_token
+                litellm.use_client = True
            self.azure = False
            if get_settings().get("OPENAI.ORG", None):
                litellm.organization = get_settings().openai.org
--- a/pr_agent/algo/token_handler.py
+++ b/pr_agent/algo/token_handler.py
@ -21,7 +21,7 @@ class TokenHandler:
      method.
    """

-    def __init__(self, pr, vars: dict, system, user):
+    def __init__(self, pr=None, vars: dict = {}, system="", user=""):
        """
        Initializes the TokenHandler object.

@ -32,7 +32,8 @@ class TokenHandler:
        - user: The user string.
        """
        self.encoder = get_token_encoder()
-        self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user)
+        if pr is not None:
+            self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user)

    def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user):
        """
--- a/pr_agent/algo/utils.py
+++ b/pr_agent/algo/utils.py
@ -174,7 +174,7 @@ def fix_json_escape_char(json_message=None):
    Raises:
        None

-    """    
+    """
    try:
        result = json.loads(json_message)
    except Exception as e:
@ -201,7 +201,7 @@ def convert_str_to_datetime(date_str):
    Example:
        >>> convert_str_to_datetime('Mon, 01 Jan 2022 12:00:00 UTC')
        datetime.datetime(2022, 1, 1, 12, 0, 0)
-    """    
+    """
    datetime_format = '%a, %d %b %Y %H:%M:%S %Z'
    return datetime.strptime(date_str, datetime_format)

--- a/pr_agent/cli.py
+++ b/pr_agent/cli.py
@ -17,6 +17,7 @@ For example:
 - cli.py --pr_url=... improve
 - cli.py --pr_url=... ask "write me a poem about this PR"
 - cli.py --pr_url=... reflect
+- cli.py --issue_url=... similar_issue

 Supported commands:
 -review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement.
@ -37,14 +38,22 @@ Configuration:
 To edit any configuration parameter from 'configuration.toml', just add -config_path=<value>.
 For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."'
 """)
-    parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
+    parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', default=None)
+    parser.add_argument('--issue_url', type=str, help='The URL of the Issue to review', default=None)
    parser.add_argument('command', type=str, help='The', choices=commands, default='review')
    parser.add_argument('rest', nargs=argparse.REMAINDER, default=[])
    args = parser.parse_args(inargs)
+    if not args.pr_url and not args.issue_url:
+        parser.print_help()
+        return
+
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
    command = args.command.lower()
    get_settings().set("CONFIG.CLI_MODE", True)
-    result = asyncio.run(PRAgent().handle_request(args.pr_url, command + " " + " ".join(args.rest)))
+    if args.issue_url:
+        result = asyncio.run(PRAgent().handle_request(args.issue_url, command + " " + " ".join(args.rest)))
+    else:
+        result = asyncio.run(PRAgent().handle_request(args.pr_url, command + " " + " ".join(args.rest)))
    if not result:
        parser.print_help()

--- a/pr_agent/git_providers/github_provider.py
+++ b/pr_agent/git_providers/github_provider.py
@ -32,7 +32,7 @@ class GithubProvider(GitProvider):
        self.diff_files = None
        self.git_files = None
        self.incremental = incremental
-        if pr_url:
+        if pr_url and 'pull' in pr_url:
            self.set_pr(pr_url)
            self.last_commit_id = list(self.pr.get_commits())[-1]

@ -309,6 +309,35 @@ class GithubProvider(GitProvider):

        return repo_name, pr_number

+    @staticmethod
+    def _parse_issue_url(issue_url: str) -> Tuple[str, int]:
+        parsed_url = urlparse(issue_url)
+
+        if 'github.com' not in parsed_url.netloc:
+            raise ValueError("The provided URL is not a valid GitHub URL")
+
+        path_parts = parsed_url.path.strip('/').split('/')
+        if 'api.github.com' in parsed_url.netloc:
+            if len(path_parts) < 5 or path_parts[3] != 'issues':
+                raise ValueError("The provided URL does not appear to be a GitHub ISSUE URL")
+            repo_name = '/'.join(path_parts[1:3])
+            try:
+                issue_number = int(path_parts[4])
+            except ValueError as e:
+                raise ValueError("Unable to convert issue number to integer") from e
+            return repo_name, issue_number
+
+        if len(path_parts) < 4 or path_parts[2] != 'issues':
+            raise ValueError("The provided URL does not appear to be a GitHub PR issue")
+
+        repo_name = '/'.join(path_parts[:2])
+        try:
+            issue_number = int(path_parts[3])
+        except ValueError as e:
+            raise ValueError("Unable to convert issue number to integer") from e
+
+        return repo_name, issue_number
+
    def _get_github_client(self):
        deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user")

--- a/pr_agent/settings/.secrets_template.toml
+++ b/pr_agent/settings/.secrets_template.toml
@ -16,6 +16,10 @@ key = ""  # Acquire through https://platform.openai.com
 #deployment_id = ""  # The deployment name you chose when you deployed the engine
 #fallback_deployments = []  # For each fallback model specified in configuration.toml in the [config] section, specify the appropriate deployment_id

+[pinecone]
+api_key = "..."
+environment = "gcp-starter"
+
 [anthropic]
 key = "" # Optional, uncomment if you want to use Anthropic. Acquire through https://www.anthropic.com/

@ -29,6 +33,9 @@ key = "" # Optional, uncomment if you want to use Replicate. Acquire through htt
 key = "" # Optional, uncomment if you want to use Huggingface Inference API. Acquire through https://huggingface.co/docs/api-inference/quicktour
 api_base = "" # the base url for your huggingface inference endpoint 

+[ollama]
+api_base = "" # the base url of your local Ollama server
+
 [github]
 # ---- Set the following only for deployment type == "user"
 user_token = ""  # A GitHub personal access token with 'repo' scope.
@ -55,3 +62,5 @@ bearer_token = ""
 app_key = ""
 base_url = ""

+[litellm]
+LITELLM_TOKEN = "" # see https://docs.litellm.ai/docs/debugging/hosted_debugging for details and instructions on how to get a token
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -94,3 +94,16 @@ polling_interval_seconds = 30
 # patch_server_endpoint = "http://127.0.0.1:5000/patch"
 # token to authenticate in the patch server
 # patch_server_token = ""
+
+[litellm]
+#use_client = false
+
+[pr_similar_issue]
+skip_comments = false
+force_update_dataset = false
+max_issues_to_scan = 500
+
+[pinecone]
+# fill in the values below and place them in .secrets.toml
+#api_key = ...
+# environment = "gcp-starter"
--- a/pr_agent/tools/pr_similar_issue.py
+++ b/pr_agent/tools/pr_similar_issue.py
@ -0,0 +1,261 @@
+import copy
+import json
+import logging
+from enum import Enum
+from typing import List, Tuple
+import pinecone
+import openai
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from pr_agent.algo import MAX_TOKENS
+from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.config_loader import get_settings
+from pr_agent.git_providers import get_git_provider
+from pinecone_datasets import Dataset, DatasetMetadata
+
+MODEL = "text-embedding-ada-002"
+
+
+class PRSimilarIssue:
+    def __init__(self, issue_url: str, args: list = None):
+        if get_settings().config.git_provider != "github":
+            raise Exception("Only github is supported for similar issue tool")
+
+        self.cli_mode = get_settings().CONFIG.CLI_MODE
+        self.max_issues_to_scan = get_settings().pr_similar_issue.max_issues_to_scan
+        self.issue_url = issue_url
+        self.git_provider = get_git_provider()()
+        repo_name, issue_number = self.git_provider._parse_issue_url(issue_url.split('=')[-1])
+        self.git_provider.repo = repo_name
+        self.git_provider.repo_obj = self.git_provider.github_client.get_repo(repo_name)
+        self.token_handler = TokenHandler()
+        repo_obj = self.git_provider.repo_obj
+        repo_name_for_index = self.repo_name_for_index = repo_obj.full_name.lower().replace('/', '-').replace('_/', '-')
+        index_name = self.index_name = "codium-ai-pr-agent-issues"
+
+        # assuming pinecone api key and environment are set in secrets file
+        try:
+            api_key = get_settings().pinecone.api_key
+            environment = get_settings().pinecone.environment
+        except Exception:
+            if not self.cli_mode:
+                repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1])
+                issue_main = self.git_provider.repo_obj.get_issue(original_issue_number)
+                issue_main.create_comment("Please set pinecone api key and environment in secrets file")
+            raise Exception("Please set pinecone api key and environment in secrets file")
+
+        # check if index exists, and if repo is already indexed
+        run_from_scratch = False
+        upsert = True
+        pinecone.init(api_key=api_key, environment=environment)
+        if not index_name in pinecone.list_indexes():
+            run_from_scratch = True
+            upsert = False
+        else:
+            if get_settings().pr_similar_issue.force_update_dataset:
+                upsert = True
+            else:
+                pinecone_index = pinecone.Index(index_name=index_name)
+                res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict()
+                if res["vectors"]:
+                    upsert = False
+
+        if run_from_scratch or upsert:  # index the entire repo
+            logging.info('Indexing the entire repo...')
+
+            logging.info('Getting issues...')
+            issues = list(repo_obj.get_issues(state='all'))
+            logging.info('Done')
+            self._update_index_with_issues(issues, repo_name_for_index, upsert=upsert)
+        else:  # update index if needed
+            pinecone_index = pinecone.Index(index_name=index_name)
+            issues_to_update = []
+            issues_paginated_list = repo_obj.get_issues(state='all')
+            counter = 1
+            for issue in issues_paginated_list:
+                if issue.pull_request:
+                    continue
+                issue_str, comments, number = self._process_issue(issue)
+                issue_key = f"issue_{number}"
+                id = issue_key + "." + "issue"
+                res = pinecone_index.fetch([id]).to_dict()
+                is_new_issue = True
+                for vector in res["vectors"].values():
+                    if vector['metadata']['repo'] == repo_name_for_index:
+                        is_new_issue = False
+                        break
+                if is_new_issue:
+                    counter += 1
+                    issues_to_update.append(issue)
+                else:
+                    break
+
+            if issues_to_update:
+                logging.info(f'Updating index with {counter} new issues...')
+                self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True)
+            else:
+                logging.info('No new issues to update')
+
+    async def run(self):
+        repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1])
+        issue_main = self.git_provider.repo_obj.get_issue(original_issue_number)
+        issue_str, comments, number = self._process_issue(issue_main)
+        openai.api_key = get_settings().openai.key
+
+        res = openai.Embedding.create(input=[issue_str], engine=MODEL)
+        embeds = [record['embedding'] for record in res['data']]
+        pinecone_index = pinecone.Index(index_name=self.index_name)
+        res = pinecone_index.query(embeds[0],
+                                   top_k=5,
+                                   filter={"repo": self.repo_name_for_index},
+                                   include_metadata=True).to_dict()
+        relevant_issues_number_list = []
+        for r in res['matches']:
+            issue_number = int(r["id"].split('.')[0].split('_')[-1])
+            if original_issue_number == issue_number:
+                continue
+            if issue_number not in relevant_issues_number_list:
+                relevant_issues_number_list.append(issue_number)
+
+        similar_issues_str = "Similar Issues:\n\n"
+        for i, issue_number_similar in enumerate(relevant_issues_number_list):
+            issue = self.git_provider.repo_obj.get_issue(issue_number_similar)
+            title = issue.title
+            url = issue.html_url
+            similar_issues_str += f"{i + 1}. [{title}]({url})\n\n"
+        if get_settings().config.publish_output:
+            response = issue_main.create_comment(similar_issues_str)
+        logging.info(similar_issues_str)
+
+    def _process_issue(self, issue):
+        header = issue.title
+        body = issue.body
+        number = issue.number
+        if get_settings().pr_similar_issue.skip_comments:
+            comments = []
+        else:
+            comments = list(issue.get_comments())
+        issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}"
+        return issue_str, comments, number
+
+    def _update_index_with_issues(self, issues_list, repo_name_for_index, upsert=False):
+        logging.info('Processing issues...')
+        corpus = Corpus()
+        example_issue_record = Record(
+            id=f"example_issue_{repo_name_for_index}",
+            text="example_issue",
+            metadata=Metadata(repo=repo_name_for_index)
+        )
+        corpus.append(example_issue_record)
+
+        counter = 0
+        for issue in issues_list:
+            if issue.pull_request:
+                continue
+
+            counter += 1
+            if counter % 100 == 0:
+                logging.info(f"Scanned {counter} issues")
+            if counter >= self.max_issues_to_scan:
+                logging.info(f"Scanned {self.max_issues_to_scan} issues, stopping")
+                break
+
+            issue_str, comments, number = self._process_issue(issue)
+            issue_key = f"issue_{number}"
+            username = issue.user.login
+            created_at = str(issue.created_at)
+            if len(issue_str) < 8000 or \
+                    self.token_handler.count_tokens(issue_str) < MAX_TOKENS[MODEL]:  # fast reject first
+                issue_record = Record(
+                    id=issue_key + "." + "issue",
+                    text=issue_str,
+                    metadata=Metadata(repo=repo_name_for_index,
+                                      username=username,
+                                      created_at=created_at,
+                                      level=IssueLevel.ISSUE)
+                )
+                corpus.append(issue_record)
+                if comments:
+                    for j, comment in enumerate(comments):
+                        comment_body = comment.body
+                        num_words_comment = len(comment_body.split())
+                        if num_words_comment < 10 or not isinstance(comment_body, str):
+                            continue
+
+                        if len(comment_body) < 8000 or \
+                                self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]:
+                            comment_record = Record(
+                                id=issue_key + ".comment_" + str(j + 1),
+                                text=comment_body,
+                                metadata=Metadata(repo=repo_name_for_index,
+                                                  username=username,  # use issue username for all comments
+                                                  created_at=created_at,
+                                                  level=IssueLevel.COMMENT)
+                            )
+                            corpus.append(comment_record)
+        df = pd.DataFrame(corpus.dict()["documents"])
+        logging.info('Done')
+
+        logging.info('Embedding...')
+        openai.api_key = get_settings().openai.key
+        list_to_encode = list(df["text"].values)
+        try:
+            res = openai.Embedding.create(input=list_to_encode, engine=MODEL)
+            embeds = [record['embedding'] for record in res['data']]
+        except:
+            embeds = []
+            logging.error('Failed to embed entire list, embedding one by one...')
+            for i, text in enumerate(list_to_encode):
+                try:
+                    res = openai.Embedding.create(input=[text], engine=MODEL)
+                    embeds.append(res['data'][0]['embedding'])
+                except:
+                    embeds.append([0] * 1536)
+        df["values"] = embeds
+        meta = DatasetMetadata.empty()
+        meta.dense_model.dimension = len(embeds[0])
+        ds = Dataset.from_pandas(df, meta)
+        logging.info('Done')
+
+        api_key = get_settings().pinecone.api_key
+        environment = get_settings().pinecone.environment
+        if not upsert:
+            logging.info('Creating index from scratch...')
+            ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment)
+        else:
+            logging.info('Upserting index...')
+            namespace = ""
+            batch_size: int = 100
+            concurrency: int = 10
+            pinecone.init(api_key=api_key, environment=environment)
+            ds._upsert_to_index(self.index_name, namespace, batch_size, concurrency)
+        logging.info('Done')
+
+
+class IssueLevel(str, Enum):
+    ISSUE = "issue"
+    COMMENT = "comment"
+
+
+class Metadata(BaseModel):
+    repo: str
+    username: str = Field(default="@codium")
+    created_at: str = Field(default="01-01-1970 00:00:00.00000")
+    level: IssueLevel = Field(default=IssueLevel.ISSUE)
+
+    class Config:
+        use_enum_values = True
+
+
+class Record(BaseModel):
+    id: str
+    text: str
+    metadata: Metadata
+
+
+class Corpus(BaseModel):
+    documents: List[Record] = Field(default=[])
+
+    def append(self, r: Record):
+        self.documents.append(r)
--- a/requirements.txt
+++ b/requirements.txt
@ -7,15 +7,17 @@ Jinja2==3.1.2
 tiktoken==0.4.0
 uvicorn==0.22.0
 python-gitlab==3.15.0
-pytest~=7.4.0
-aiohttp~=3.8.4
+pytest==7.4.0
+aiohttp==3.8.4
 atlassian-python-api==3.39.0
-GitPython~=3.1.32
+GitPython==3.1.32
 PyYAML==6.0
 starlette-context==0.3.6
-litellm~=0.1.538
-boto3~=1.28.25
+litellm~=0.1.574
+boto3==1.28.25
 google-cloud-storage==2.10.0
 ujson==5.8.0
 azure-devops==7.1.0b3
-msrest==0.7.1
+msrest==0.7.1
+pinecone-client
+pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main