refactor: Move clip_tokens function from pr_processing to utils module, and add tests

This commit is contained in:
mrT23
2023-11-26 08:29:47 +02:00
parent d7df4287f8
commit 9465b7b577
7 changed files with 58 additions and 37 deletions

View File

@ -10,7 +10,7 @@ from github import RateLimitExceededException
from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions
from pr_agent.algo.language_handler import sort_files_by_main_languages
from pr_agent.algo.file_filter import filter_ignored
from pr_agent.algo.token_handler import TokenHandler, get_token_encoder
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.algo.utils import get_max_tokens
from pr_agent.config_loader import get_settings
from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider, EDIT_TYPE
@ -326,35 +326,6 @@ def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
return position, absolute_position
def clip_tokens(text: str, max_tokens: int) -> str:
"""
Clip the number of tokens in a string to a maximum number of tokens.
Args:
text (str): The string to clip.
max_tokens (int): The maximum number of tokens allowed in the string.
Returns:
str: The clipped string.
"""
if not text:
return text
try:
encoder = get_token_encoder()
num_input_tokens = len(encoder.encode(text))
if num_input_tokens <= max_tokens:
return text
num_chars = len(text)
chars_per_token = num_chars / num_input_tokens
num_output_chars = int(chars_per_token * max_tokens)
clipped_text = text[:num_output_chars]
return clipped_text
except Exception as e:
get_logger().warning(f"Failed to clip tokens: {e}")
return text
def get_pr_multi_diffs(git_provider: GitProvider,
token_handler: TokenHandler,
model: str,

View File

@ -11,6 +11,7 @@ import yaml
from starlette_context import context
from pr_agent.algo import MAX_TOKENS
from pr_agent.algo.token_handler import get_token_encoder
from pr_agent.config_loader import get_settings, global_settings
from pr_agent.log import get_logger
@ -378,3 +379,34 @@ def get_max_tokens(model):
max_tokens_model = min(settings.config.max_model_tokens, max_tokens_model)
# get_logger().debug(f"limiting max tokens to {max_tokens_model}")
return max_tokens_model
def clip_tokens(text: str, max_tokens: int, add_three_dots=True) -> str:
"""
Clip the number of tokens in a string to a maximum number of tokens.
Args:
text (str): The string to clip.
max_tokens (int): The maximum number of tokens allowed in the string.
add_three_dots (bool, optional): A boolean indicating whether to add three dots at the end of the clipped
Returns:
str: The clipped string.
"""
if not text:
return text
try:
encoder = get_token_encoder()
num_input_tokens = len(encoder.encode(text))
if num_input_tokens <= max_tokens:
return text
num_chars = len(text)
chars_per_token = num_chars / num_input_tokens
num_output_chars = int(chars_per_token * max_tokens)
clipped_text = text[:num_output_chars]
if add_three_dots:
clipped_text += "...(truncated)"
return clipped_text
except Exception as e:
get_logger().warning(f"Failed to clip tokens: {e}")
return text