Mirror of https://github.com/qodo-ai/pr-agent.git (synced 2025-07-02 11:50:37 +08:00)

Commit: Merge branch 'main' into tr/litellm_debugger
.github/workflows/pr-agent-review.yaml | 7 lines changed (vendored)
@@ -21,7 +21,10 @@ jobs:
         id: pragent
         uses: Codium-ai/pr-agent@main
         env:
-          OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
-          OPENAI_ORG: ${{ secrets.OPENAI_ORG }} # optional
+          OPENAI.KEY: ${{ secrets.OPENAI_KEY }}
+          OPENAI.ORG: ${{ secrets.OPENAI_ORG }} # optional
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PINECONE.API_KEY: ${{ secrets.PINECONE_API_KEY }}
+          PINECONE.ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }}

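The switch from underscored to dotted variable names matters because the settings layer exposes secrets per config section, and the new similar-issue tool reads them exactly that way. A minimal sketch (assuming the action's environment is mapped into the settings object, as the dotted names suggest):

    from pr_agent.config_loader import get_settings

    # How the tools consume these values (see pr_similar_issue.py below):
    openai_key = get_settings().openai.key          # from OPENAI.KEY
    pinecone_key = get_settings().pinecone.api_key  # from PINECONE.API_KEY
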
Usage.md | 12 lines changed
@@ -50,12 +50,12 @@ When running from your local repo (CLI), your local configuration file will be used
 Examples for invoking the different tools via the CLI:
 
-- **Review**: `python cli.py --pr_url=<pr_url> /review`
-- **Describe**: `python cli.py --pr_url=<pr_url> /describe`
-- **Improve**: `python cli.py --pr_url=<pr_url> /improve`
-- **Ask**: `python cli.py --pr_url=<pr_url> /ask "Write me a poem about this PR"`
-- **Reflect**: `python cli.py --pr_url=<pr_url> /reflect`
-- **Update Changelog**: `python cli.py --pr_url=<pr_url> /update_changelog`
+- **Review**: `python cli.py --pr_url=<pr_url> review`
+- **Describe**: `python cli.py --pr_url=<pr_url> describe`
+- **Improve**: `python cli.py --pr_url=<pr_url> improve`
+- **Ask**: `python cli.py --pr_url=<pr_url> ask "Write me a poem about this PR"`
+- **Reflect**: `python cli.py --pr_url=<pr_url> reflect`
+- **Update Changelog**: `python cli.py --pr_url=<pr_url> update_changelog`
 
 `<pr_url>` is the url of the relevant PR (for example: https://github.com/Codium-ai/pr-agent/pull/50).

pr_agent/agent/pr_agent.py

@@ -9,6 +9,7 @@ from pr_agent.git_providers import get_git_provider
 from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
 from pr_agent.tools.pr_description import PRDescription
 from pr_agent.tools.pr_information_from_user import PRInformationFromUser
+from pr_agent.tools.pr_similar_issue import PRSimilarIssue
 from pr_agent.tools.pr_questions import PRQuestions
 from pr_agent.tools.pr_reviewer import PRReviewer
 from pr_agent.tools.pr_update_changelog import PRUpdateChangelog
@@ -30,6 +31,7 @@ command2class = {
     "update_changelog": PRUpdateChangelog,
     "config": PRConfig,
     "settings": PRConfig,
+    "similar_issue": PRSimilarIssue,
 }
 
 commands = list(command2class.keys())

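For orientation, a rough sketch of how the new mapping is consumed (a hypothetical condensation of PRAgent.handle_request; the real method also handles settings and argument parsing):

    from pr_agent.agent.pr_agent import command2class

    async def dispatch(url: str, request: str):
        command, *args = request.strip("/").split(" ")
        tool_class = command2class[command.lower()]  # e.g. "similar_issue" -> PRSimilarIssue
        await tool_class(url, args=args).run()       # here: PRSimilarIssue(issue_url).run()
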
pr_agent/algo/__init__.py

@@ -1,4 +1,5 @@
 MAX_TOKENS = {
+    'text-embedding-ada-002': 8000,
     'gpt-3.5-turbo': 4000,
     'gpt-3.5-turbo-0613': 4000,
     'gpt-3.5-turbo-0301': 4000,

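The new 'text-embedding-ada-002' entry gives the similar-issue tool a token budget for the embedding model. An illustrative helper (fits_embedding_budget is not in the codebase; its condition mirrors the guard in pr_similar_issue.py below):

    from pr_agent.algo import MAX_TOKENS
    from pr_agent.algo.token_handler import TokenHandler

    MODEL = "text-embedding-ada-002"

    def fits_embedding_budget(text: str, handler: TokenHandler) -> bool:
        # short texts are accepted outright; longer ones get an exact token count
        return len(text) < 8000 or handler.count_tokens(text) < MAX_TOKENS[MODEL]
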
pr_agent/algo/token_handler.py

@@ -21,7 +21,7 @@ class TokenHandler:
     method.
     """
 
-    def __init__(self, pr, vars: dict, system, user):
+    def __init__(self, pr=None, vars: dict = {}, system="", user=""):
         """
         Initializes the TokenHandler object.
 
@@ -32,7 +32,8 @@ class TokenHandler:
         - user: The user string.
         """
         self.encoder = get_token_encoder()
-        self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user)
+        if pr is not None:
+            self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user)
 
     def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user):
         """

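The motivation for the relaxed signature: the similar-issue tool constructs a bare TokenHandler purely for token counting, with no PR whose prompt tokens could be precomputed. A minimal usage sketch:

    from pr_agent.algo.token_handler import TokenHandler

    handler = TokenHandler()                 # no pr, so prompt_tokens is skipped
    n = handler.count_tokens("issue body text here")
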
pr_agent/algo/utils.py

@@ -168,7 +168,7 @@ def fix_json_escape_char(json_message=None):
     Raises:
         None
 
     """
     try:
         result = json.loads(json_message)
     except Exception as e:
@@ -195,7 +195,7 @@ def convert_str_to_datetime(date_str):
     Example:
         >>> convert_str_to_datetime('Mon, 01 Jan 2022 12:00:00 UTC')
         datetime.datetime(2022, 1, 1, 12, 0, 0)
     """
     datetime_format = '%a, %d %b %Y %H:%M:%S %Z'
     return datetime.strptime(date_str, datetime_format)
 

pr_agent/cli.py

@@ -17,6 +17,7 @@ For example:
 - cli.py --pr_url=... improve
 - cli.py --pr_url=... ask "write me a poem about this PR"
 - cli.py --pr_url=... reflect
+- cli.py --issue_url=... similar_issue
 
 Supported commands:
 -review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement.
@@ -37,14 +38,22 @@ Configuration:
 To edit any configuration parameter from 'configuration.toml', just add -config_path=<value>.
 For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."'
 """)
-    parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
+    parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', default=None)
+    parser.add_argument('--issue_url', type=str, help='The URL of the Issue to review', default=None)
     parser.add_argument('command', type=str, help='The', choices=commands, default='review')
     parser.add_argument('rest', nargs=argparse.REMAINDER, default=[])
     args = parser.parse_args(inargs)
+    if not args.pr_url and not args.issue_url:
+        parser.print_help()
+        return
+
     logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
     command = args.command.lower()
     get_settings().set("CONFIG.CLI_MODE", True)
-    result = asyncio.run(PRAgent().handle_request(args.pr_url, command + " " + " ".join(args.rest)))
+    if args.issue_url:
+        result = asyncio.run(PRAgent().handle_request(args.issue_url, command + " " + " ".join(args.rest)))
+    else:
+        result = asyncio.run(PRAgent().handle_request(args.pr_url, command + " " + " ".join(args.rest)))
     if not result:
         parser.print_help()

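A sketch of exercising the new path programmatically (this assumes cli.py's top-level entry point is a run(inargs=None) function, and the issue URL is a placeholder):

    from pr_agent.cli import run

    run(inargs=["--issue_url=https://github.com/Codium-ai/pr-agent/issues/1",
                "similar_issue"])
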
pr_agent/git_providers/github_provider.py

@@ -32,7 +32,7 @@ class GithubProvider(GitProvider):
         self.diff_files = None
         self.git_files = None
         self.incremental = incremental
-        if pr_url:
+        if pr_url and 'pull' in pr_url:
             self.set_pr(pr_url)
             self.last_commit_id = list(self.pr.get_commits())[-1]
 
@@ -309,6 +309,35 @@ class GithubProvider(GitProvider):
 
         return repo_name, pr_number
 
+    @staticmethod
+    def _parse_issue_url(issue_url: str) -> Tuple[str, int]:
+        parsed_url = urlparse(issue_url)
+
+        if 'github.com' not in parsed_url.netloc:
+            raise ValueError("The provided URL is not a valid GitHub URL")
+
+        path_parts = parsed_url.path.strip('/').split('/')
+        if 'api.github.com' in parsed_url.netloc:
+            if len(path_parts) < 5 or path_parts[3] != 'issues':
+                raise ValueError("The provided URL does not appear to be a GitHub ISSUE URL")
+            repo_name = '/'.join(path_parts[1:3])
+            try:
+                issue_number = int(path_parts[4])
+            except ValueError as e:
+                raise ValueError("Unable to convert issue number to integer") from e
+            return repo_name, issue_number
+
+        if len(path_parts) < 4 or path_parts[2] != 'issues':
+            raise ValueError("The provided URL does not appear to be a GitHub PR issue")
+
+        repo_name = '/'.join(path_parts[:2])
+        try:
+            issue_number = int(path_parts[3])
+        except ValueError as e:
+            raise ValueError("Unable to convert issue number to integer") from e
+
+        return repo_name, issue_number
+
     def _get_github_client(self):
         deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user")

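A quick check of the two URL shapes the new parser accepts (owner, repo, and issue number are illustrative):

    from pr_agent.git_providers.github_provider import GithubProvider

    # web URL: https://github.com/<owner>/<repo>/issues/<number>
    repo, num = GithubProvider._parse_issue_url(
        "https://github.com/Codium-ai/pr-agent/issues/1")
    assert (repo, num) == ("Codium-ai/pr-agent", 1)

    # REST API URL: https://api.github.com/repos/<owner>/<repo>/issues/<number>
    repo, num = GithubProvider._parse_issue_url(
        "https://api.github.com/repos/Codium-ai/pr-agent/issues/1")
    assert (repo, num) == ("Codium-ai/pr-agent", 1)
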
pr_agent/settings/.secrets_template.toml

@@ -16,6 +16,10 @@ key = "" # Acquire through https://platform.openai.com
 #deployment_id = "" # The deployment name you chose when you deployed the engine
 #fallback_deployments = [] # For each fallback model specified in configuration.toml in the [config] section, specify the appropriate deployment_id
 
+[pinecone]
+api_key = "..."
+environment = "gcp-starter"
+
 [anthropic]
 key = "" # Optional, uncomment if you want to use Anthropic. Acquire through https://www.anthropic.com/

pr_agent/settings/configuration.toml

@@ -97,3 +97,13 @@ polling_interval_seconds = 30
 
 [litellm]
 #use_client = false
+
+[pr_similar_issue]
+skip_comments = false
+force_update_dataset = false
+max_issues_to_scan = 500
+
+[pinecone]
+# fill and place in .secrets.toml
+#api_key = ...
+# environment = "gcp-starter"

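These defaults reach the tool through the settings object; for example (attribute names as defined above):

    from pr_agent.config_loader import get_settings

    settings = get_settings()
    max_issues = settings.pr_similar_issue.max_issues_to_scan   # 500 by default
    skip_comments = settings.pr_similar_issue.skip_comments     # false by default
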
pr_agent/tools/pr_similar_issue.py | 261 lines (new file)
@@ -0,0 +1,261 @@
+import copy
+import json
+import logging
+from enum import Enum
+from typing import List, Tuple
+import pinecone
+import openai
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from pr_agent.algo import MAX_TOKENS
+from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.config_loader import get_settings
+from pr_agent.git_providers import get_git_provider
+from pinecone_datasets import Dataset, DatasetMetadata
+
+MODEL = "text-embedding-ada-002"
+
+
+class PRSimilarIssue:
+    def __init__(self, issue_url: str, args: list = None):
+        if get_settings().config.git_provider != "github":
+            raise Exception("Only github is supported for similar issue tool")
+
+        self.cli_mode = get_settings().CONFIG.CLI_MODE
+        self.max_issues_to_scan = get_settings().pr_similar_issue.max_issues_to_scan
+        self.issue_url = issue_url
+        self.git_provider = get_git_provider()()
+        repo_name, issue_number = self.git_provider._parse_issue_url(issue_url.split('=')[-1])
+        self.git_provider.repo = repo_name
+        self.git_provider.repo_obj = self.git_provider.github_client.get_repo(repo_name)
+        self.token_handler = TokenHandler()
+        repo_obj = self.git_provider.repo_obj
+        repo_name_for_index = self.repo_name_for_index = repo_obj.full_name.lower().replace('/', '-').replace('_/', '-')
+        index_name = self.index_name = "codium-ai-pr-agent-issues"
+
+        # assuming pinecone api key and environment are set in secrets file
+        try:
+            api_key = get_settings().pinecone.api_key
+            environment = get_settings().pinecone.environment
+        except Exception:
+            if not self.cli_mode:
+                repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1])
+                issue_main = self.git_provider.repo_obj.get_issue(original_issue_number)
+                issue_main.create_comment("Please set pinecone api key and environment in secrets file")
+            raise Exception("Please set pinecone api key and environment in secrets file")
+
+        # check if index exists, and if repo is already indexed
+        run_from_scratch = False
+        upsert = True
+        pinecone.init(api_key=api_key, environment=environment)
+        if not index_name in pinecone.list_indexes():
+            run_from_scratch = True
+            upsert = False
+        else:
+            if get_settings().pr_similar_issue.force_update_dataset:
+                upsert = True
+            else:
+                pinecone_index = pinecone.Index(index_name=index_name)
+                res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict()
+                if res["vectors"]:
+                    upsert = False
+
+        if run_from_scratch or upsert:  # index the entire repo
+            logging.info('Indexing the entire repo...')
+
+            logging.info('Getting issues...')
+            issues = list(repo_obj.get_issues(state='all'))
+            logging.info('Done')
+            self._update_index_with_issues(issues, repo_name_for_index, upsert=upsert)
+        else:  # update index if needed
+            pinecone_index = pinecone.Index(index_name=index_name)
+            issues_to_update = []
+            issues_paginated_list = repo_obj.get_issues(state='all')
+            counter = 1
+            for issue in issues_paginated_list:
+                if issue.pull_request:
+                    continue
+                issue_str, comments, number = self._process_issue(issue)
+                issue_key = f"issue_{number}"
+                id = issue_key + "." + "issue"
+                res = pinecone_index.fetch([id]).to_dict()
+                is_new_issue = True
+                for vector in res["vectors"].values():
+                    if vector['metadata']['repo'] == repo_name_for_index:
+                        is_new_issue = False
+                        break
+                if is_new_issue:
+                    counter += 1
+                    issues_to_update.append(issue)
+                else:
+                    break
+
+            if issues_to_update:
+                logging.info(f'Updating index with {counter} new issues...')
+                self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True)
+            else:
+                logging.info('No new issues to update')
+
+    async def run(self):
+        repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1])
+        issue_main = self.git_provider.repo_obj.get_issue(original_issue_number)
+        issue_str, comments, number = self._process_issue(issue_main)
+        openai.api_key = get_settings().openai.key
+
+        res = openai.Embedding.create(input=[issue_str], engine=MODEL)
+        embeds = [record['embedding'] for record in res['data']]
+        pinecone_index = pinecone.Index(index_name=self.index_name)
+        res = pinecone_index.query(embeds[0],
+                                   top_k=5,
+                                   filter={"repo": self.repo_name_for_index},
+                                   include_metadata=True).to_dict()
+        relevant_issues_number_list = []
+        for r in res['matches']:
+            issue_number = int(r["id"].split('.')[0].split('_')[-1])
+            if original_issue_number == issue_number:
+                continue
+            if issue_number not in relevant_issues_number_list:
+                relevant_issues_number_list.append(issue_number)
+
+        similar_issues_str = "Similar Issues:\n\n"
+        for i, issue_number_similar in enumerate(relevant_issues_number_list):
+            issue = self.git_provider.repo_obj.get_issue(issue_number_similar)
+            title = issue.title
+            url = issue.html_url
+            similar_issues_str += f"{i + 1}. [{title}]({url})\n\n"
+        if get_settings().config.publish_output:
+            response = issue_main.create_comment(similar_issues_str)
+        logging.info(similar_issues_str)
+
+    def _process_issue(self, issue):
+        header = issue.title
+        body = issue.body
+        number = issue.number
+        if get_settings().pr_similar_issue.skip_comments:
+            comments = []
+        else:
+            comments = list(issue.get_comments())
+        issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}"
+        return issue_str, comments, number
+
+    def _update_index_with_issues(self, issues_list, repo_name_for_index, upsert=False):
+        logging.info('Processing issues...')
+        corpus = Corpus()
+        example_issue_record = Record(
+            id=f"example_issue_{repo_name_for_index}",
+            text="example_issue",
+            metadata=Metadata(repo=repo_name_for_index)
+        )
+        corpus.append(example_issue_record)
+
+        counter = 0
+        for issue in issues_list:
+            if issue.pull_request:
+                continue
+
+            counter += 1
+            if counter % 100 == 0:
+                logging.info(f"Scanned {counter} issues")
+            if counter >= self.max_issues_to_scan:
+                logging.info(f"Scanned {self.max_issues_to_scan} issues, stopping")
+                break
+
+            issue_str, comments, number = self._process_issue(issue)
+            issue_key = f"issue_{number}"
+            username = issue.user.login
+            created_at = str(issue.created_at)
+            if len(issue_str) < 8000 or \
+                    self.token_handler.count_tokens(issue_str) < MAX_TOKENS[MODEL]:  # fast reject first
+                issue_record = Record(
+                    id=issue_key + "." + "issue",
+                    text=issue_str,
+                    metadata=Metadata(repo=repo_name_for_index,
+                                      username=username,
+                                      created_at=created_at,
+                                      level=IssueLevel.ISSUE)
+                )
+                corpus.append(issue_record)
+                if comments:
+                    for j, comment in enumerate(comments):
+                        comment_body = comment.body
+                        num_words_comment = len(comment_body.split())
+                        if num_words_comment < 10 or not isinstance(comment_body, str):
+                            continue
+
+                        if len(comment_body) < 8000 or \
+                                self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]:
+                            comment_record = Record(
+                                id=issue_key + ".comment_" + str(j + 1),
+                                text=comment_body,
+                                metadata=Metadata(repo=repo_name_for_index,
+                                                  username=username,  # use issue username for all comments
+                                                  created_at=created_at,
+                                                  level=IssueLevel.COMMENT)
+                            )
+                            corpus.append(comment_record)
+        df = pd.DataFrame(corpus.dict()["documents"])
+        logging.info('Done')
+
+        logging.info('Embedding...')
+        openai.api_key = get_settings().openai.key
+        list_to_encode = list(df["text"].values)
+        try:
+            res = openai.Embedding.create(input=list_to_encode, engine=MODEL)
+            embeds = [record['embedding'] for record in res['data']]
+        except:
+            embeds = []
+            logging.error('Failed to embed entire list, embedding one by one...')
+            for i, text in enumerate(list_to_encode):
+                try:
+                    res = openai.Embedding.create(input=[text], engine=MODEL)
+                    embeds.append(res['data'][0]['embedding'])
+                except:
+                    embeds.append([0] * 1536)
+        df["values"] = embeds
+        meta = DatasetMetadata.empty()
+        meta.dense_model.dimension = len(embeds[0])
+        ds = Dataset.from_pandas(df, meta)
+        logging.info('Done')
+
+        api_key = get_settings().pinecone.api_key
+        environment = get_settings().pinecone.environment
+        if not upsert:
+            logging.info('Creating index from scratch...')
+            ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment)
+        else:
+            logging.info('Upserting index...')
+            namespace = ""
+            batch_size: int = 100
+            concurrency: int = 10
+            pinecone.init(api_key=api_key, environment=environment)
+            ds._upsert_to_index(self.index_name, namespace, batch_size, concurrency)
+        logging.info('Done')
+
+
+class IssueLevel(str, Enum):
+    ISSUE = "issue"
+    COMMENT = "comment"
+
+
+class Metadata(BaseModel):
+    repo: str
+    username: str = Field(default="@codium")
+    created_at: str = Field(default="01-01-1970 00:00:00.00000")
+    level: IssueLevel = Field(default=IssueLevel.ISSUE)
+
+    class Config:
+        use_enum_values = True
+
+
+class Record(BaseModel):
+    id: str
+    text: str
+    metadata: Metadata
+
+
+class Corpus(BaseModel):
+    documents: List[Record] = Field(default=[])
+
+    def append(self, r: Record):
+        self.documents.append(r)

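One detail of the new file worth noting: each vector ID encodes the issue number and record type ('issue_<n>.issue' for bodies, 'issue_<n>.comment_<j>' for comments), and run() recovers the issue number by splitting the ID. A small round-trip sketch of that scheme (these helper names are illustrative, not part of the file):

    def make_record_id(issue_number: int, comment_index: int = 0) -> str:
        suffix = "issue" if comment_index == 0 else f"comment_{comment_index}"
        return f"issue_{issue_number}.{suffix}"

    def issue_number_from_id(record_id: str) -> int:
        # mirrors run(): int(r["id"].split('.')[0].split('_')[-1])
        return int(record_id.split('.')[0].split('_')[-1])

    assert issue_number_from_id(make_record_id(123)) == 123
    assert issue_number_from_id(make_record_id(123, comment_index=2)) == 123
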
requirements.txt

@@ -7,15 +7,17 @@ Jinja2==3.1.2
 tiktoken==0.4.0
 uvicorn==0.22.0
 python-gitlab==3.15.0
-pytest~=7.4.0
-aiohttp~=3.8.4
+pytest==7.4.0
+aiohttp==3.8.4
 atlassian-python-api==3.39.0
-GitPython~=3.1.32
+GitPython==3.1.32
 PyYAML==6.0
 starlette-context==0.3.6
 litellm~=0.1.574
-boto3~=1.28.25
+boto3==1.28.25
 google-cloud-storage==2.10.0
 ujson==5.8.0
 azure-devops==7.1.0b3
 msrest==0.7.1
+pinecone-client
+pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main
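On the specifier change above: '~=' is PEP 440's compatible-release operator (pytest~=7.4.0 allows any 7.4.x), while '==' pins one exact version, trading automatic patch updates for reproducible installs.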