diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index be44ab03..2188e8cc 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -100,7 +100,7 @@ polling_interval_seconds = 30
 [pr_similar_issue]
 skip_comments = false
 force_update_dataset = false
-max_issues_to_scan = 1000
+max_issues_to_scan = 500
 
 [pinecone]
 # fill and place in .secrets.toml
diff --git a/pr_agent/tools/pr_similar_issue.py b/pr_agent/tools/pr_similar_issue.py
index 50ec4c6b..98d6a1f6 100644
--- a/pr_agent/tools/pr_similar_issue.py
+++ b/pr_agent/tools/pr_similar_issue.py
@@ -176,24 +176,24 @@ class PRSimilarIssue:
                                       level=IssueLevel.ISSUE)
                 )
                 corpus.append(issue_record)
-            if comments:
-                for j, comment in enumerate(comments):
-                    comment_body = comment.body
-                    num_words_comment = len(comment_body.split())
-                    if num_words_comment < 10 or not isinstance(comment_body, str):
-                        continue
+                if comments:
+                    for j, comment in enumerate(comments):
+                        comment_body = comment.body
+                        num_words_comment = len(comment_body.split())
+                        if num_words_comment < 10 or not isinstance(comment_body, str):
+                            continue
 
-                    if len(issue_str) < 8000 or \
-                            self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]:
-                        comment_record = Record(
-                            id=issue_key + ".comment_" + str(j + 1),
-                            text=comment_body,
-                            metadata=Metadata(repo=repo_name_for_index,
-                                              username=username,  # use issue username for all comments
-                                              created_at=created_at,
-                                              level=IssueLevel.COMMENT)
-                        )
-                        corpus.append(comment_record)
+                        if len(comment_body) < 8000 or \
+                                self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]:
+                            comment_record = Record(
+                                id=issue_key + ".comment_" + str(j + 1),
+                                text=comment_body,
+                                metadata=Metadata(repo=repo_name_for_index,
+                                                  username=username,  # use issue username for all comments
+                                                  created_at=created_at,
+                                                  level=IssueLevel.COMMENT)
+                            )
+                            corpus.append(comment_record)
 
         df = pd.DataFrame(corpus.dict()["documents"])
         logging.info('Done')
@@ -205,9 +205,13 @@ class PRSimilarIssue:
             embeds = [record['embedding'] for record in res['data']]
         except:
             embeds = []
+            logging.error('Failed to embed entire list, embedding one by one...')
             for i, text in enumerate(list_to_encode):
-                res = openai.Embedding.create(input=[text], engine=MODEL)
-                embeds.append(res['data'][0]['embedding'])
+                try:
+                    res = openai.Embedding.create(input=[text], engine=MODEL)
+                    embeds.append(res['data'][0]['embedding'])
+                except:
+                    embeds.append([0] * 1536)
 df["values"] = embeds
         meta = DatasetMetadata.empty()
         meta.dense_model.dimension = len(embeds[0])
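For context, the fallback added in the last hunk amounts to the standalone pattern below. This is a sketch, not part of the diff: `embed_with_fallback` and `EMBED_DIM` are illustrative names, and it assumes the pre-1.0 `openai.Embedding` API that pr_similar_issue.py already uses. The hard-coded 1536 matches the output dimension of OpenAI's text-embedding-ada-002, so a record whose embedding call fails degrades to a zero vector instead of desynchronizing `df["values"]` from the corpus rows (and `len(embeds[0])` still yields the right dimension for the dataset metadata).

```python
# Sketch of the per-item embedding fallback -- not part of the diff.
# Assumes the legacy openai<1.0 Embedding API used in pr_similar_issue.py.
import logging
import openai

MODEL = "text-embedding-ada-002"  # returns 1536-dimensional vectors
EMBED_DIM = 1536

def embed_with_fallback(list_to_encode: list[str]) -> list[list[float]]:
    """Try one batched call first; on failure, embed items one by one,
    substituting a zero vector for any item that still fails."""
    try:
        res = openai.Embedding.create(input=list_to_encode, engine=MODEL)
        return [record['embedding'] for record in res['data']]
    except Exception:
        logging.error('Failed to embed entire list, embedding one by one...')
        embeds = []
        for text in list_to_encode:
            try:
                res = openai.Embedding.create(input=[text], engine=MODEL)
                embeds.append(res['data'][0]['embedding'])
            except Exception:
                # Zero vector keeps this position aligned with its corpus row;
                # the record simply won't match anything meaningful in search.
                embeds.append([0.0] * EMBED_DIM)
        return embeds
```

The trade-off is that a zero-vector record stays in the index as dead weight rather than aborting the whole dataset build, which fits the robustness theme of the rest of the commit (the lower `max_issues_to_scan` and the `logging.error` breadcrumb).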