Added help_docs feature.

This commit is contained in:
Eyal Sharon
2025-03-20 16:32:16 +02:00
parent b087458e33
commit 5e7e353670
12 changed files with 599 additions and 2 deletions

View File

@ -13,6 +13,7 @@ from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
from pr_agent.tools.pr_config import PRConfig
from pr_agent.tools.pr_description import PRDescription
from pr_agent.tools.pr_generate_labels import PRGenerateLabels
from pr_agent.tools.pr_help_docs import PRHelpDocs
from pr_agent.tools.pr_help_message import PRHelpMessage
from pr_agent.tools.pr_line_questions import PR_LineQuestions
from pr_agent.tools.pr_questions import PRQuestions
@ -39,6 +40,7 @@ command2class = {
"similar_issue": PRSimilarIssue,
"add_docs": PRAddDocs,
"generate_labels": PRGenerateLabels,
"help_docs": PRHelpDocs,
}
commands = list(command2class.keys())

View File

@ -22,6 +22,7 @@ def set_parser():
- cli.py --pr_url=... ask "write me a poem about this PR"
- cli.py --pr_url=... reflect
- cli.py --issue_url=... similar_issue
- cli.py --pr_url/--issue_url= help_docs [<asked question>]
Supported commands:
- review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement.
@ -40,6 +41,8 @@ def set_parser():
- add_docs
- generate_labels
- help_docs - Ask a question, from either an issue or PR context, on a given repo (current context or a different one)
Configuration:

View File

@ -28,6 +28,7 @@ global_settings = Dynaconf(
"settings/pr_add_docs.toml",
"settings/custom_labels.toml",
"settings/pr_help_prompts.toml",
"settings/pr_help_docs_prompts.toml",
"settings/.secrets.toml",
"settings_prod/.secrets.toml",
]]

View File

@ -67,6 +67,33 @@ class BitbucketProvider(GitProvider):
except Exception:
return ""
def get_git_repo_url(self, pr_url: str=None) -> str: #bitbucket does not support issue url, so ignore param
try:
parsed_url = urlparse(self.pr_url)
return f"{parsed_url.scheme}://{parsed_url.netloc}/{self.workspace_slug}/{self.repo_slug}.git"
except Exception as e:
get_logger().exception(f"url is not a valid merge requests url: {self.pr_url}")
return ""
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
scheme_and_netloc = None
if repo_git_url:
parsed_git_url = urlparse(repo_git_url)
scheme_and_netloc = parsed_git_url.scheme + "://" + parsed_git_url.netloc
repo_path = parsed_git_url.path.split('.git')[0][1:] #/<workspace>/<repo>.git -> <workspace>/<repo>
if repo_path.count('/') != 1:
get_logger().error(f"repo_git_url is not a valid git repo url: {repo_git_url}")
return ("", "")
workspace_name, project_name = repo_path.split('/')
else:
parsed_pr_url = urlparse(self.pr_url)
scheme_and_netloc = parsed_pr_url.scheme + "://" + parsed_pr_url.netloc
workspace_name, project_name = (self.workspace_slug, self.repo_slug)
prefix = f"{scheme_and_netloc}/{workspace_name}/{project_name}/src/{desired_branch}"
suffix = "" #None
return (prefix, suffix)
def publish_code_suggestions(self, code_suggestions: list) -> bool:
"""
Publishes code suggestions as comments on the PR.

View File

@ -138,6 +138,31 @@ class BitbucketServerProvider(GitProvider):
return False
return True
def get_git_repo_url(self, pr_url: str=None) -> str: #bitbucket server does not support issue url, so ignore param
try:
parsed_url = urlparse(self.pr_url)
return f"{parsed_url.scheme}://{parsed_url.netloc}/scm/{self.workspace_slug.lower()}/{self.repo_slug.lower()}.git"
except Exception as e:
get_logger().exception(f"url is not a valid merge requests url: {self.pr_url}")
return ""
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
workspace_name = None
project_name = None
if not repo_git_url:
workspace_name = self.workspace_slug
project_name = self.repo_slug
else:
repo_path = repo_git_url.split('.git')[0].split('scm/')[-1]
if repo_path.count('/') == 1: # Has to have the form <workspace>/<repo>
workspace_name, project_name = repo_path.split('/')
if not workspace_name or not project_name:
get_logger().error(f"workspace_name or project_name not found in context, either git url: {repo_git_url} or uninitialized workspace/project.")
return ("", "")
prefix = f"{self.bitbucket_server_url}/projects/{workspace_name}/repos/{project_name}/browse"
suffix = f"?at=refs%2Fheads%2F{desired_branch}"
return (prefix, suffix)
def set_pr(self, pr_url: str):
self.workspace_slug, self.repo_slug, self.pr_num = self._parse_pr_url(pr_url)
self.pr = self._get_pr()

View File

@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
# enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED)
from typing import Optional
from typing import Optional, Tuple
from pr_agent.algo.types import FilePatchInfo
from pr_agent.algo.utils import Range, process_description
@ -14,6 +14,19 @@ class GitProvider(ABC):
def is_supported(self, capability: str) -> bool:
pass
#Given a url (issues or PR/MR) - get the .git repo url to which they belong. Needs to be implemented by the provider.
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
get_logger().warning("Not implemented! Returning empty url")
return ""
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo. Needs to be implemented by the provider.
# For example: For a git: https://git_provider.com/MY_PROJECT/MY_REPO.git then it should return ('https://git_provider.com/projects/MY_PROJECT/repos/MY_REPO', '?=<SOME HEADER>')
# so that to properly view the file: docs/readme.md -> <PREFIX>/docs/readme.md<SUFFIX> -> https://git_provider.com/projects/MY_PROJECT/repos/MY_REPO/docs/readme.md?=<SOME HEADER>)
def get_canonical_url_parts(self, repo_git_url:str, desired_branch:str) -> Tuple[str, str]:
get_logger().warning("Not implemented! Returning empty prefix and suffix")
return ("", "")
@abstractmethod
def get_files(self) -> list:
pass

View File

@ -63,6 +63,53 @@ class GithubProvider(GitProvider):
def is_supported(self, capability: str) -> bool:
return True
def _get_owner_and_repo_path(self, given_url: str) -> str:
try:
repo_path = None
if 'issues' in given_url:
repo_path, _ = self._parse_issue_url(given_url)
elif 'pull' in given_url:
repo_path, _ = self._parse_pr_url(given_url)
elif given_url.endswith('.git'):
parsed_url = urlparse(given_url)
repo_path = (parsed_url.path.split('.git')[0])[1:] # /<owner>/<repo>.git -> <owner>/<repo>
if not repo_path:
get_logger().error(f"url is neither an issues url nor a pr url nor a valid git url: {given_url}. Returning empty result.")
return ""
return repo_path
except Exception as e:
get_logger().exception(f"unable to parse url: {given_url}. Returning empty result.")
return ""
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
repo_path = self._get_owner_and_repo_path(issues_or_pr_url)
return f"{issues_or_pr_url.split(repo_path)[0]}{repo_path}.git"
def get_canonical_url_parts(self, repo_git_url:str, desired_branch:str) -> Tuple[str, str]:
owner = None
repo = None
scheme_and_netloc = None
if repo_git_url: #If user provided an external git url, which may be different than what this provider was initialized with, we cannot use self.repo
repo_path = self._get_owner_and_repo_path(repo_git_url)
parsed_git_url = urlparse(repo_git_url)
scheme_and_netloc = parsed_git_url.scheme + "://" + parsed_git_url.netloc
if repo_path.count('/') == 1: #Has to have the form <owner>/<repo>
owner, repo = repo_path.split('/')
else:
get_logger().error(f"Invalid repo_path: {repo_path} from repo_git_url: {repo_git_url}")
return ("", "")
if (not owner or not repo) and self.repo: #"else" - User did not provide an external git url, use self.repo object:
owner, repo = self.repo.split('/')
scheme_and_netloc = self.base_url_html
if not any([scheme_and_netloc, owner, repo]): #"else": Not invoked from a PR context,but no provided git url for context
get_logger().error(f"Unable to get canonical url parts since missing context (PR or explicit git url)")
return ("", "")
prefix = f"{scheme_and_netloc}/{owner}/{repo}/blob/{desired_branch}"
suffix = "" # github does not add a suffix
return (prefix, suffix)
def get_pr_url(self) -> str:
return self.pr.html_url

View File

@ -57,6 +57,38 @@ class GitLabProvider(GitProvider):
return False
return True
def _get_project_path_from_pr_or_issue_url(self, pr_or_issue_url: str) -> str:
repo_project_path = None
if 'issues' in pr_or_issue_url:
#replace 'issues' with 'merge_requests', since gitlab provider does not support issue urls, just to get the git repo url:
pr_or_issue_url = pr_or_issue_url.replace('issues', 'merge_requests')
if 'merge_requests' in pr_or_issue_url:
repo_project_path, _ = self._parse_merge_request_url(pr_or_issue_url)
if not repo_project_path:
get_logger().error(f"url is not a valid merge requests url: {pr_or_issue_url}")
return ""
return repo_project_path
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
provider_url = issues_or_pr_url
repo_path = self._get_project_path_from_pr_or_issue_url(issues_or_pr_url)
if not repo_path:
return ""
return f"{provider_url.split(repo_path)[0]}{repo_path}.git"
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
repo_path = ""
if not repo_git_url and not self.pr_url:
get_logger().error("Cannot get canonical URL parts: missing either context PR URL or a repo GIT URL")
return ("", "")
if not repo_git_url: #Use PR url as context
repo_path = self._get_project_path_from_pr_or_issue_url(self.pr_url)
else: #Use repo git url
repo_path = repo_git_url.split('.git')[0].split('.com/')[-1]
prefix = f"{self.gitlab_url}/{repo_path}/-/blob/{desired_branch}"
suffix = "?ref_type=heads" # gitlab cloud adds this suffix. gitlab server does not, but it is harmless.
return (prefix, suffix)
@property
def pr(self):
'''The GitLab terminology is merge request (MR) instead of pull request (PR)'''

View File

@ -212,6 +212,12 @@ num_retrieved_snippets=5
[pr_config] # /config #
[pr_help_docs]
repo_url = "" #If not overwritten, will use the repo from where the context came from (issue or PR)
docs_path = "docs"
exclude_root_readme = false
supported_doc_exts = [".md", ".mdx", ".rst"]
[github]
# The type of deployment to create. Valid values are 'app' or 'user'.
deployment_type = "user"

View File

@ -0,0 +1,77 @@
[pr_help_docs_prompts]
system="""You are Doc-helper, a language model designed to answer questions about a documentation website for a given repository.
You will receive a question, a repository url and the full documentation content for that repository (either as markdown or as restructred text).
Your goal is to provide the best answer to the question using the documentation provided.
Additional instructions:
- Be short and concise in your answers. Give examples if needed.
- Answer only questions that are related to the documentation website content. If the question is completely unrelated to the documentation, return an empty response.
The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions:
=====
class relevant_section(BaseModel):
file_name: str = Field(description="The name of the relevant file")
relevant_section_header_string: str = Field(description="The exact text of the relevant markdown/restructured text section heading from the relevant file (starting with '#', '##', etc.). Return empty string if the entire file is the relevant section, or if the relevant section has no heading")
class DocHelper(BaseModel):
user_question: str = Field(description="The user's question")
response: str = Field(description="The response to the user's question")
relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown/restructured text sections in the documentation that answer the user's question, ordered by importance (most relevant first)")
question_is_relevant: int = Field(description="Return 1 if the question is somewhat relevant to documenation. 0 - otherwise")
=====
Example output:
```yaml
user_question: |
...
response: |
...
relevant_sections:
- file_name: "src/file1.py"
relevant_section_header_string: |
...
- ...
question_is_relevant: |
1
"""
user="""\
Documentation url: '{{ docs_url| trim }}'
-----
User's Question:
=====
{{ question|trim }}
=====
Documentation website content:
=====
{{ snippets|trim }}
=====
Reminder: The output must be a YAML object equivalent to type $DocHelper, similar to the following example output:
=====
Example output:
```yaml
user_question: |
...
response: |
...
relevant_sections:
- file_name: "src/file1.py"
relevant_section_header_string: |
...
- ...
question_is_relevant: |
1
=====
Response (should be a valid YAML, and nothing else).
```yaml
"""

View File

@ -0,0 +1,365 @@
import copy
from functools import partial
from jinja2 import Environment, StrictUndefined
import math
import os
import re
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional, Tuple
from pr_agent.algo import MAX_TOKENS
from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
from pr_agent.algo.pr_processing import retry_with_fallback_models
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import clip_tokens, get_max_tokens, load_yaml, ModelType
from pr_agent.config_loader import get_settings
from pr_agent.git_providers import get_git_provider_with_context
from pr_agent.log import get_logger
#Common code that can be called from similar tools:
def modify_answer_section(ai_response: str) -> str | None:
# Gets the model's answer and relevant sources section, repacing the heading of the answer section with:
# :bulb: Auto-generated documentation-based answer:
"""
For example: The following input:
### Question: \nThe following general issue was asked by a user: Title: How does one request to re-review a PR? More Info: I cannot seem to find to do this.
### Answer:\nAccording to the documentation, one needs to invoke the command: /review
#### Relevant Sources...
Should become:
### :bulb: Auto-generated documentation-based answer:\n
According to the documentation, one needs to invoke the command: /review
#### Relevant Sources...
"""
model_answer_and_relevant_sections_in_response \
= _extract_model_answer_and_relevant_sources(ai_response)
if model_answer_and_relevant_sections_in_response is not None:
cleaned_question_with_answer = "### :bulb: Auto-generated documentation-based answer:\n"
cleaned_question_with_answer += model_answer_and_relevant_sections_in_response
return cleaned_question_with_answer
get_logger().warning(f"Either no answer section found, or that section is malformed: {ai_response}")
return None
def _extract_model_answer_and_relevant_sources(ai_response: str) -> str | None:
# It is assumed that the input contains several sections with leading "### ",
# where the answer is the last one of them having the format: "### Answer:\n"), since the model returns the answer
# AFTER the user question. By splitting using the string: "### Answer:\n" and grabbing the last part,
# the model answer is guaranteed to be in that last part, provided it is followed by a "#### Relevant Sources:\n\n".
# (for more details, see here: https://github.com/Codium-ai/pr-agent-pro/blob/main/pr_agent/tools/pr_help_message.py#L173)
"""
For example:
### Question: \nHow does one request to re-review a PR?\n\n
### Answer:\nAccording to the documentation, one needs to invoke the command: /review\n\n
#### Relevant Sources:\n\n...
The answer part is: "According to the documentation, one needs to invoke the command: /review\n\n"
followed by "Relevant Sources:\n\n".
"""
if "### Answer:\n" in ai_response:
model_answer_and_relevant_sources_sections_in_response = ai_response.split("### Answer:\n")[-1]
# Split such part by "Relevant Sources" section to contain only the model answer:
if "#### Relevant Sources:\n\n" in model_answer_and_relevant_sources_sections_in_response:
model_answer_section_in_response \
= model_answer_and_relevant_sources_sections_in_response.split("#### Relevant Sources:\n\n")[0]
get_logger().info(f"Found model answer: {model_answer_section_in_response}")
return model_answer_and_relevant_sources_sections_in_response \
if len(model_answer_section_in_response) > 0 else None
get_logger().warning(f"Either no answer section found, or that section is malformed: {ai_response}")
return None
def get_maximal_text_input_length_for_token_count_estimation():
model = get_settings().config.model
if 'claude-3-7-sonnet' in model.lower():
return 900000 #Claude API for token estimation allows maximal text input of 900K chars
return math.inf #Otherwise, no known limitation on input text just for token estimation
# Load documentation files to memory, decorating them with a header to mark where each file begins,
# as to help the LLM to give a better answer.
def aggregate_documentation_files_for_prompt_contents(base_path: str, doc_files: List[str]) -> Optional[str]:
docs_prompt = ""
for file in doc_files:
try:
with open(file, 'r', encoding='utf-8') as f:
content = f.read()
# Skip files with no text content
if not re.search(r'[a-zA-Z]', content):
continue
file_path = str(file).replace(str(base_path), '')
docs_prompt += f"\n==file name==\n\n{file_path}\n\n==file content==\n\n{content.strip()}\n=========\n\n"
except Exception as e:
get_logger().warning(f"Error while reading the file {file}: {e}")
continue
if not docs_prompt:
get_logger().error("Couldn't find any usable documentation files. Returning None.")
return None
return docs_prompt
def format_markdown_q_and_a_response(question_str: str, response_str: str, relevant_sections: List[Dict[str, str]],
supported_suffixes: List[str], base_url_prefix: str, base_url_suffix: str="") -> str:
answer_str = ""
answer_str += f"### Question: \n{question_str}\n\n"
answer_str += f"### Answer:\n{response_str.strip()}\n\n"
answer_str += f"#### Relevant Sources:\n\n"
for section in relevant_sections:
file = section.get('file_name').strip()
ext = [suffix for suffix in supported_suffixes if file.endswith(suffix)]
if not ext:
get_logger().warning(f"Unsupported file extension: {file}")
continue
if str(section['relevant_section_header_string']).strip():
markdown_header = format_markdown_header(section['relevant_section_header_string'])
if base_url_prefix:
answer_str += f"> - {base_url_prefix}{file}{base_url_suffix}#{markdown_header}\n"
else:
answer_str += f"> - {base_url_prefix}{file}{base_url_suffix}\n"
return answer_str
def format_markdown_header(header: str) -> str:
try:
# First, strip common characters from both ends
cleaned = header.strip('# 💎\n')
# Define all characters to be removed/replaced in a single pass
replacements = {
"'": '',
"`": '',
'(': '',
')': '',
',': '',
'.': '',
'?': '',
'!': '',
' ': '-'
}
# Compile regex pattern for characters to remove
pattern = re.compile('|'.join(map(re.escape, replacements.keys())))
# Perform replacements in a single pass and convert to lowercase
return pattern.sub(lambda m: replacements[m.group()], cleaned).lower()
except Exception:
get_logger().exception(f"Error while formatting markdown header", artifacts={'header': header})
return ""
def clean_markdown_content(content: str) -> str:
"""
Remove hidden comments and unnecessary elements from markdown content to reduce size.
Args:
content: The original markdown content
Returns:
Cleaned markdown content
"""
# Remove HTML comments
content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
# Remove frontmatter (YAML between --- or +++ delimiters)
content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)
content = re.sub(r'^\+\+\+\s*\n.*?\n\+\+\+\s*\n', '', content, flags=re.DOTALL)
# Remove excessive blank lines (more than 2 consecutive)
content = re.sub(r'\n{3,}', '\n\n', content)
# Remove HTML tags that are often used for styling only
content = re.sub(r'<div.*?>|</div>|<span.*?>|</span>', '', content, flags=re.DOTALL)
# Remove image alt text which can be verbose
content = re.sub(r'!\[(.*?)\]', '![]', content)
# Remove images completely
content = re.sub(r'!\[.*?\]\(.*?\)', '', content)
# Remove simple HTML tags but preserve content between them
content = re.sub(r'<(?!table|tr|td|th|thead|tbody)([a-zA-Z][a-zA-Z0-9]*)[^>]*>(.*?)</\1>',
r'\2', content, flags=re.DOTALL)
return content.strip()
class PredictionPreparator:
def __init__(self, ai_handler, vars, system_prompt, user_prompt):
self.ai_handler = ai_handler
variables = copy.deepcopy(vars)
environment = Environment(undefined=StrictUndefined)
self.system_prompt = environment.from_string(system_prompt).render(variables)
self.user_prompt = environment.from_string(user_prompt).render(variables)
async def __call__(self, model: str) -> str:
try:
response, finish_reason = await self.ai_handler.chat_completion(
model=model, temperature=get_settings().config.temperature, system=self.system_prompt, user=self.user_prompt)
return response
except Exception as e:
get_logger().error(f"Error while preparing prediction: {e}")
return ""
class PRHelpDocs(object):
def __init__(self, ctx_url, ai_handler:partial[BaseAiHandler,] = LiteLLMAIHandler, args: Tuple[str]=None, return_as_string: bool=False):
self.ctx_url = ctx_url
self.question = args[0] if args else None
self.return_as_string = return_as_string
self.repo_url_given_explicitly = True
self.repo_url = get_settings()['PR_HELP_DOCS.REPO_URL']
self.include_root_readme_file = not(get_settings()['PR_HELP_DOCS.EXCLUDE_ROOT_README'])
self.supported_doc_exts = get_settings()['PR_HELP_DOCS.SUPPORTED_DOC_EXTS']
self.docs_path = get_settings()['PR_HELP_DOCS.DOCS_PATH']
retrieved_settings = [self.include_root_readme_file, self.supported_doc_exts, self.docs_path]
if any([setting is None for setting in retrieved_settings]):
raise Exception(f"One of the settings is invalid: {retrieved_settings}")
self.git_provider = get_git_provider_with_context(ctx_url)
if not self.git_provider:
raise Exception(f"No git provider found at {ctx_url}")
if not self.repo_url:
self.repo_url_given_explicitly = False
get_logger().debug(f"No explicit repo url provided, deducing it from type: {self.git_provider.__class__.__name__} "
f"context url: {self.ctx_url}")
self.repo_url = self.git_provider.get_git_repo_url(self.ctx_url)
get_logger().debug(f"deduced repo url: {self.repo_url}")
try: #Try to get the same branch in case triggered from a PR:
self.repo_desired_branch = self.git_provider.get_pr_branch()
except: #Otherwise (such as in issues)
self.repo_desired_branch = get_settings()['PR_HELP_DOCS.REPO_DEFAULT_BRANCH']
finally:
get_logger().debug(f"repo_desired_branch: {self.repo_desired_branch}")
self.ai_handler = ai_handler()
self.vars = {
"docs_url": self.repo_url,
"question": self.question,
"snippets": "",
}
self.token_handler = TokenHandler(None,
self.vars,
get_settings().pr_help_docs_prompts.system,
get_settings().pr_help_docs_prompts.user)
async def run(self):
if not self.question:
get_logger().warning('No question provided. Will do nothing.')
return None
try:
# Clone the repository and gather relevant documentation files.
docs_prompt = None
with TemporaryDirectory() as tmp_dir:
get_logger().debug(f"About to clone repository: {self.repo_url} to temporary directory: {tmp_dir}...")
returned_cloned_repo_root = self.git_provider.clone(self.repo_url, tmp_dir, remove_dest_folder=False)
if not returned_cloned_repo_root:
raise Exception(f"Failed to clone {self.repo_url} to {tmp_dir}")
get_logger().debug(f"About to gather relevant documentation files...")
doc_files = []
if self.include_root_readme_file:
for root, _, files in os.walk(returned_cloned_repo_root.path):
# Only look at files in the root directory, not subdirectories
if root == returned_cloned_repo_root.path:
for file in files:
if file.lower().startswith("readme."):
doc_files.append(os.path.join(root, file))
abs_docs_path = os.path.join(returned_cloned_repo_root.path, self.docs_path)
if os.path.exists(abs_docs_path):
doc_files.extend(self._find_all_document_files_matching_exts(abs_docs_path,
ignore_readme=(self.docs_path=='.')))
if not doc_files:
get_logger().warning(f"No documentation files found matching file extensions: "
f"{self.supported_doc_exts} under repo: {self.repo_url} path: {self.docs_path}")
return None
get_logger().info(f'Answering a question inside context {self.ctx_url} for repo: {self.repo_url}'
f' using the following documentation files: ', artifacts={'doc_files': doc_files})
docs_prompt = aggregate_documentation_files_for_prompt_contents(returned_cloned_repo_root.path, doc_files)
if not docs_prompt:
get_logger().warning(f"Error reading one of the documentation files. Returning with no result...")
return None
docs_prompt_to_send_to_model = docs_prompt
# Estimate how many tokens will be needed. Trim in case of exceeding limit.
# Firstly, check if text needs to be trimmed, as some models fail to return the estimated token count if the input text is too long.
max_allowed_txt_input = get_maximal_text_input_length_for_token_count_estimation()
if len(docs_prompt_to_send_to_model) >= max_allowed_txt_input:
get_logger().warning(f"Text length: {len(docs_prompt_to_send_to_model)} exceeds the current returned limit of {max_allowed_txt_input} just for token count estimation. Trimming the text...")
docs_prompt_to_send_to_model = docs_prompt_to_send_to_model[:max_allowed_txt_input]
# Then, count the tokens in the prompt. If the count exceeds the limit, trim the text.
token_count = self.token_handler.count_tokens(docs_prompt_to_send_to_model, force_accurate=True)
get_logger().debug(f"Estimated token count of documentation to send to model: {token_count}")
model = get_settings().config.model
if model in MAX_TOKENS:
max_tokens_full = MAX_TOKENS[model] # note - here we take the actual max tokens, without any reductions. we do aim to get the full documentation website in the prompt
else:
max_tokens_full = get_max_tokens(model)
delta_output = 5000 #Elbow room to reduce chance of exceeding token limit or model paying less attention to prompt guidelines.
if token_count > max_tokens_full - delta_output:
docs_prompt_to_send_to_model = clean_markdown_content(docs_prompt_to_send_to_model) #Reduce unnecessary text/images/etc.
get_logger().info(f"Token count {token_count} exceeds the limit {max_tokens_full - delta_output}. Attempting to clip text to fit within the limit...")
docs_prompt_to_send_to_model = clip_tokens(docs_prompt_to_send_to_model, max_tokens_full - delta_output,
num_input_tokens=token_count)
self.vars['snippets'] = docs_prompt_to_send_to_model.strip()
# Run the AI model and extract sections from its response
response = await retry_with_fallback_models(PredictionPreparator(self.ai_handler, self.vars,
get_settings().pr_help_docs_prompts.system,
get_settings().pr_help_docs_prompts.user),
model_type=ModelType.REGULAR)
response_yaml = load_yaml(response)
if not response_yaml:
get_logger().exception("Failed to parse the AI response.", artifacts={'response': response})
raise Exception(f"Failed to parse the AI response.")
response_str = response_yaml.get('response')
relevant_sections = response_yaml.get('relevant_sections')
if not response_str or not relevant_sections:
get_logger().exception("Failed to extract response/relevant sections.",
artifacts={'response_str': response_str, 'relevant_sections': relevant_sections})
raise Exception(f"Failed to extract response/relevant sections.")
# Format the response as markdown
canonical_url_prefix, canonical_url_suffix = self.git_provider.get_canonical_url_parts(repo_git_url=self.repo_url if self.repo_url_given_explicitly else None,
desired_branch=self.repo_desired_branch)
answer_str = format_markdown_q_and_a_response(self.question, response_str, relevant_sections, self.supported_doc_exts, canonical_url_prefix, canonical_url_suffix)
if answer_str:
#Remove the question phrase and replace with light bulb and a heading mentioning this is an automated answer:
answer_str = modify_answer_section(answer_str)
# For PR help docs, we return the answer string instead of publishing it
if answer_str and self.return_as_string:
if int(response_yaml.get('question_is_relevant', '1')) == 0:
get_logger().warning(f"Chat help docs answer would be ignored due to an invalid question.",
artifacts={'answer_str': answer_str})
return ""
get_logger().info(f"Chat help docs answer", artifacts={'answer_str': answer_str})
return answer_str
# Publish the answer
if not answer_str or int(response_yaml.get('question_is_relevant', '1')) == 0:
get_logger().info(f"No answer found")
return ""
if get_settings().config.publish_output:
self.git_provider.publish_comment(answer_str)
else:
get_logger().info("Answer:", artifacts={'answer_str': answer_str})
except:
get_logger().exception('failed to provide answer to given user question as a result of a thrown exception (see above)')
def _find_all_document_files_matching_exts(self, abs_docs_path: str, ignore_readme=False) -> List[str]:
matching_files = []
# Ensure extensions don't have leading dots and are lowercase
dotless_extensions = [ext.lower().lstrip('.') for ext in self.supported_doc_exts]
# Walk through directory and subdirectories
for root, _, files in os.walk(abs_docs_path):
for file in files:
if ignore_readme and root == abs_docs_path and file.lower() in [f"readme.{ext}" for ext in dotless_extensions]:
continue
# Check if file has one of the specified extensions
if any(file.lower().endswith(f'.{ext}') for ext in dotless_extensions):
matching_files.append(os.path.join(root, file))
return matching_files

View File

@ -35,7 +35,6 @@ class PRHelpMessage:
self.ai_handler = ai_handler()
self.question_str = self.parse_args(args)
self.return_as_string = return_as_string
self.num_retrieved_snippets = get_settings().get('pr_help.num_retrieved_snippets', 5)
if self.question_str:
self.vars = {
"question": self.question_str,