def clip_tokens(text: str, max_tokens: int) -> str:
    """
    Clip the number of tokens in a string to a maximum number of tokens.

    The token count is not measured; it is estimated heuristically by
    assuming 2.5 tokens per whitespace-delimited word, so at most
    ``max_tokens // 2.5`` words are kept.

    Args:
        text (str): The string to clip. Falsy values (empty string, None) are
            returned unchanged.
        max_tokens (int): The maximum number of tokens allowed in the string.
            Values that leave no room for a word (including negative values)
            clip the text before its first word.

    Returns:
        str: The clipped string. The cut is made at the start of the first
        word that exceeds the budget, so whitespace preceding that word is
        retained. Text that fits within the budget is returned as-is.
    """
    # Nothing to clip; also avoids handing None to the regex engine.
    if not text:
        return text

    # Floor division by a float yields a float, so convert to a real integer
    # word count. Clamp at zero: a negative budget would otherwise never match
    # any word index and the text would be returned unclipped.
    max_words = max(int(max_tokens // 2.5), 0)

    for index, match in enumerate(re.finditer(r'\S+', text)):
        if index == max_words:
            # First word over the budget: drop it and everything after it.
            return text[:match.start()]
    return text
TokenHandler( self.git_provider.pr,