pr-agent/pr_agent/git_providers/bitbucket_provider.py
sharoneyal 14971c4f5f Add support for documentation content exceeding token limits (#1670)
* Add support for documentation content exceeding token limits, via a two-phase operation:
1. Ask the LLM to rank the headings most likely to contain an answer to a user's question.
2. Provide the corresponding files for the LLM to search for an answer.

- Refactor help_docs to make the code more readable.
- When resolving a canonical path, git providers now use the repository's default branch rather than the PR's source branch.
- Refactor token counting, making it clear when an estimate factor will be used.

* Code review changes:
1. Correctly handle exceptions during retry_with_fallback_models (to allow a fallback model to run in case of failure).
2. Better naming for default_branch in the Bitbucket Cloud provider.
2025-04-03 11:51:26 +03:00


import difflib
import json
import re
from typing import Optional, Tuple
from urllib.parse import urlparse
import requests
from atlassian.bitbucket import Cloud
from starlette_context import context
from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo
from ..algo.file_filter import filter_ignored
from ..algo.language_handler import is_valid_file
from ..algo.utils import find_line_number_of_relevant_line_in_file
from ..config_loader import get_settings
from ..log import get_logger
from .git_provider import MAX_FILES_ALLOWED_FULL, GitProvider
def _get_filename(diff):
    """Return the post-change path for a diff entry, falling back to the pre-change path."""
    if diff.new.path:
        return diff.new.path
    return diff.old.path


class BitbucketProvider(GitProvider):
def __init__(
self, pr_url: Optional[str] = None, incremental: Optional[bool] = False
):
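        # Build an authenticated requests session: the bearer token is taken from the
        # request context when available, otherwise from BITBUCKET.BEARER_TOKEN in the settings.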
s = requests.Session()
try:
self.bearer_token = bearer = context.get("bitbucket_bearer_token", None)
if not bearer and get_settings().get("BITBUCKET.BEARER_TOKEN", None):
self.bearer_token = bearer = get_settings().get("BITBUCKET.BEARER_TOKEN", None)
s.headers["Authorization"] = f"Bearer {bearer}"
except Exception:
self.bearer_token = get_settings().get("BITBUCKET.BEARER_TOKEN", None)
s.headers[
"Authorization"
] = f'Bearer {self.bearer_token}'
s.headers["Content-Type"] = "application/json"
self.headers = s.headers
self.bitbucket_client = Cloud(session=s)
self.max_comment_length = 31000
self.workspace_slug = None
self.repo_slug = None
self.repo = None
self.pr_num = None
self.pr = None
self.pr_url = pr_url
self.temp_comments = []
self.incremental = incremental
self.diff_files = None
self.git_files = None
if pr_url:
self.set_pr(pr_url)
self.bitbucket_comment_api_url = self.pr._BitbucketBase__data["links"]["comments"]["href"]
self.bitbucket_pull_request_api_url = self.pr._BitbucketBase__data["links"]['self']['href']
def get_repo_settings(self):
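        # Fetch .pr_agent.toml from the PR's destination branch.
        # Note: returns the file contents as utf-8 encoded bytes, or "" when the file
        # is missing (404) or the request fails.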
try:
url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/"
f"{self.pr.destination_branch}/.pr_agent.toml")
response = requests.request("GET", url, headers=self.headers)
if response.status_code == 404: # not found
return ""
contents = response.text.encode('utf-8')
return contents
except Exception:
return ""
    def get_git_repo_url(self, pr_url: Optional[str] = None) -> str:  # bitbucket does not support issue urls, so the param is ignored
        try:
            parsed_url = urlparse(self.pr_url)
            return f"{parsed_url.scheme}://{parsed_url.netloc}/{self.workspace_slug}/{self.repo_slug}.git"
        except Exception:
            get_logger().exception(f"url is not a valid pull request url: {self.pr_url}")
            return ""
    # Given a git repo url, return the provider's prefix and suffix needed to view a given file belonging to that repo.
    # Example: for the repo https://bitbucket.org/codiumai/pr-agent.git and branch main -> prefix: "https://bitbucket.org/codiumai/pr-agent/src/main", suffix: ""
    # In case a git url is not provided, the provider uses the PR context to determine the repo, and the repo's default branch.
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
scheme_and_netloc = None
if repo_git_url:
parsed_git_url = urlparse(repo_git_url)
scheme_and_netloc = parsed_git_url.scheme + "://" + parsed_git_url.netloc
repo_path = parsed_git_url.path.split('.git')[0][1:] #/<workspace>/<repo>.git -> <workspace>/<repo>
if repo_path.count('/') != 1:
get_logger().error(f"repo_git_url is not a valid git repo url: {repo_git_url}")
return ("", "")
workspace_name, project_name = repo_path.split('/')
else:
desired_branch = self.get_repo_default_branch()
parsed_pr_url = urlparse(self.pr_url)
scheme_and_netloc = parsed_pr_url.scheme + "://" + parsed_pr_url.netloc
workspace_name, project_name = (self.workspace_slug, self.repo_slug)
prefix = f"{scheme_and_netloc}/{workspace_name}/{project_name}/src/{desired_branch}"
suffix = "" #None
return (prefix, suffix)
def publish_code_suggestions(self, code_suggestions: list) -> bool:
"""
Publishes code suggestions as comments on the PR.
"""
post_parameters_list = []
for suggestion in code_suggestions:
body = suggestion["body"]
original_suggestion = suggestion.get('original_suggestion', None) # needed for diff code
if original_suggestion:
try:
existing_code = original_suggestion['existing_code'].rstrip() + "\n"
improved_code = original_suggestion['improved_code'].rstrip() + "\n"
diff = difflib.unified_diff(existing_code.split('\n'),
improved_code.split('\n'), n=999)
patch_orig = "\n".join(diff)
patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n')
diff_code = f"\n\n```diff\n{patch.rstrip()}\n```"
# replace ```suggestion ... ``` with diff_code, using regex:
body = re.sub(r'```suggestion.*?```', diff_code, body, flags=re.DOTALL)
except Exception as e:
get_logger().exception(f"Bitbucket failed to get diff code for publishing, error: {e}")
continue
relevant_file = suggestion["relevant_file"]
relevant_lines_start = suggestion["relevant_lines_start"]
relevant_lines_end = suggestion["relevant_lines_end"]
if not relevant_lines_start or relevant_lines_start == -1:
get_logger().exception(
f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}"
)
continue
if relevant_lines_end < relevant_lines_start:
get_logger().exception(
f"Failed to publish code suggestion, "
f"relevant_lines_end is {relevant_lines_end} and "
f"relevant_lines_start is {relevant_lines_start}"
)
continue
if relevant_lines_end > relevant_lines_start:
post_parameters = {
"body": body,
"path": relevant_file,
"line": relevant_lines_end,
"start_line": relevant_lines_start,
"start_side": "RIGHT",
}
else: # API is different for single line comments
post_parameters = {
"body": body,
"path": relevant_file,
"line": relevant_lines_start,
"side": "RIGHT",
}
post_parameters_list.append(post_parameters)
try:
self.publish_inline_comments(post_parameters_list)
return True
except Exception as e:
get_logger().error(f"Bitbucket failed to publish code suggestion, error: {e}")
return False
def publish_file_comments(self, file_comments: list) -> bool:
pass
def is_supported(self, capability: str) -> bool:
if capability in ['get_issue_comments', 'publish_inline_comments', 'get_labels', 'gfm_markdown',
'publish_file_comments']:
return False
return True
def set_pr(self, pr_url: str):
self.workspace_slug, self.repo_slug, self.pr_num = self._parse_pr_url(pr_url)
self.pr = self._get_pr()
def get_files(self):
try:
git_files = context.get("git_files", None)
if git_files:
return git_files
            self.git_files = [_get_filename(diff) for diff in self.pr.diffstat()]
context["git_files"] = self.git_files
return self.git_files
except Exception:
if not self.git_files:
                self.git_files = [_get_filename(diff) for diff in self.pr.diffstat()]
return self.git_files
def get_diff_files(self) -> list[FilePatchInfo]:
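        """
        Build the list of FilePatchInfo objects for this PR: fetch the diffstat,
        drop ignored and invalid files, split the raw diff into per-file patches,
        and attach full before/after file contents for up to MAX_FILES_ALLOWED_FULL // 2 files.
        """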
if self.diff_files:
return self.diff_files
diffs_original = list(self.pr.diffstat())
diffs = filter_ignored(diffs_original, 'bitbucket')
if diffs != diffs_original:
try:
names_original = [d.new.path for d in diffs_original]
names_kept = [d.new.path for d in diffs]
names_filtered = list(set(names_original) - set(names_kept))
get_logger().info(f"Filtered out [ignore] files for PR", extra={
'original_files': names_original,
'names_kept': names_kept,
'names_filtered': names_filtered
})
            except Exception:
pass
# get the pr patches
try:
pr_patches = self.pr.diff()
except Exception as e:
# Try different encodings if UTF-8 fails
get_logger().warning(f"Failed to decode PR patch with utf-8, error: {e}")
encodings_to_try = ['iso-8859-1', 'latin-1', 'ascii', 'utf-16']
pr_patches = None
for encoding in encodings_to_try:
try:
pr_patches = self.pr.diff(encoding=encoding)
get_logger().info(f"Successfully decoded PR patch with encoding {encoding}")
break
except UnicodeDecodeError:
continue
if pr_patches is None:
raise ValueError(f"Failed to decode PR patch with encodings {encodings_to_try}")
diff_split = ["diff --git" + x for x in pr_patches.split("diff --git") if x.strip()]
        # keep only the chunks of 'diff_split' whose index in 'diffs_original' corresponds to a file that survived filtering into 'diffs'
if len(diff_split) > len(diffs) and len(diffs_original) == len(diff_split):
diff_split = [diff_split[i] for i in range(len(diff_split)) if diffs_original[i] in diffs]
if len(diff_split) != len(diffs):
get_logger().error(f"Error - failed to split the diff into {len(diffs)} parts")
return []
# bitbucket diff has a header for each file, we need to remove it:
# "diff --git filename
# new file mode 100644 (optional)
# index caa56f0..61528d7 100644
# --- a/pr_agent/cli_pip.py
# +++ b/pr_agent/cli_pip.py
# @@ -... @@"
for i, _ in enumerate(diff_split):
diff_split_lines = diff_split[i].splitlines()
if (len(diff_split_lines) >= 6) and \
((diff_split_lines[2].startswith("---") and
diff_split_lines[3].startswith("+++") and
diff_split_lines[4].startswith("@@")) or
(diff_split_lines[3].startswith("---") and # new or deleted file
diff_split_lines[4].startswith("+++") and
diff_split_lines[5].startswith("@@"))):
diff_split[i] = "\n".join(diff_split_lines[4:])
else:
if diffs[i].data.get('lines_added', 0) == 0 and diffs[i].data.get('lines_removed', 0) == 0:
diff_split[i] = ""
elif len(diff_split_lines) <= 3:
diff_split[i] = ""
get_logger().info(f"Disregarding empty diff for file {_gef_filename(diffs[i])}")
else:
get_logger().warning(f"Bitbucket failed to get diff for file {_gef_filename(diffs[i])}")
diff_split[i] = ""
invalid_files_names = []
diff_files = []
counter_valid = 0
# get full files
for index, diff in enumerate(diffs):
            file_path = _get_filename(diff)
if not is_valid_file(file_path):
invalid_files_names.append(file_path)
continue
try:
counter_valid += 1
if get_settings().get("bitbucket_app.avoid_full_files", False):
original_file_content_str = ""
new_file_content_str = ""
elif counter_valid < MAX_FILES_ALLOWED_FULL // 2: # factor 2 because bitbucket has limited API calls
if diff.old.get_data("links"):
original_file_content_str = self._get_pr_file_content(
diff.old.get_data("links")['self']['href'])
else:
original_file_content_str = ""
if diff.new.get_data("links"):
new_file_content_str = self._get_pr_file_content(diff.new.get_data("links")['self']['href'])
else:
new_file_content_str = ""
else:
if counter_valid == MAX_FILES_ALLOWED_FULL // 2:
                    get_logger().info(
                        "Too many files in this Bitbucket PR; skipping full file content for the remaining files")
original_file_content_str = ""
new_file_content_str = ""
except Exception as e:
get_logger().exception(f"Error - bitbucket failed to get file content, error: {e}")
original_file_content_str = ""
new_file_content_str = ""
file_patch_canonic_structure = FilePatchInfo(
original_file_content_str,
new_file_content_str,
diff_split[index],
file_path,
)
if diff.data['status'] == 'added':
file_patch_canonic_structure.edit_type = EDIT_TYPE.ADDED
elif diff.data['status'] == 'removed':
file_patch_canonic_structure.edit_type = EDIT_TYPE.DELETED
elif diff.data['status'] == 'modified':
file_patch_canonic_structure.edit_type = EDIT_TYPE.MODIFIED
elif diff.data['status'] == 'renamed':
file_patch_canonic_structure.edit_type = EDIT_TYPE.RENAMED
diff_files.append(file_patch_canonic_structure)
if invalid_files_names:
get_logger().info(f"Disregarding files with invalid extensions:\n{invalid_files_names}")
self.diff_files = diff_files
return diff_files
def get_latest_commit_url(self):
return self.pr.data['source']['commit']['links']['html']['href']
def get_comment_url(self, comment):
return comment.data['links']['html']['href']
def publish_persistent_comment(self, pr_comment: str,
initial_header: str,
update_header: bool = True,
name='review',
final_update_message=True):
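        # Look for an existing comment containing initial_header and update it in place;
        # fall back to publishing a new comment if none is found or the update fails.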
try:
for comment in self.pr.comments():
body = comment.raw
if initial_header in body:
latest_commit_url = self.get_latest_commit_url()
comment_url = self.get_comment_url(comment)
if update_header:
updated_header = f"{initial_header}\n\n#### ({name.capitalize()} updated until commit {latest_commit_url})\n"
pr_comment_updated = pr_comment.replace(initial_header, updated_header)
else:
pr_comment_updated = pr_comment
get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message")
d = {"content": {"raw": pr_comment_updated}}
response = comment._update_data(comment.put(None, data=d))
if final_update_message:
self.publish_comment(
f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}")
return
except Exception as e:
get_logger().exception(f"Failed to update persistent review, error: {e}")
pass
self.publish_comment(pr_comment)
def publish_comment(self, pr_comment: str, is_temporary: bool = False):
if is_temporary and not get_settings().config.publish_output_progress:
get_logger().debug(f"Skipping publish_comment for temporary comment: {pr_comment}")
return None
pr_comment = self.limit_output_characters(pr_comment, self.max_comment_length)
comment = self.pr.comment(pr_comment)
if is_temporary:
self.temp_comments.append(comment["id"])
return comment
def edit_comment(self, comment, body: str):
try:
body = self.limit_output_characters(body, self.max_comment_length)
comment.update(body)
except Exception as e:
get_logger().exception(f"Failed to update comment, error: {e}")
def remove_initial_comment(self):
try:
for comment in self.temp_comments:
self.remove_comment(comment)
except Exception as e:
get_logger().exception(f"Failed to remove temp comments, error: {e}")
def remove_comment(self, comment):
try:
self.pr.delete(f"comments/{comment}")
except Exception as e:
get_logger().exception(f"Failed to remove comment, error: {e}")
    # Build an inline-comment dict for the given file and line; returns {} when the
    # relevant line cannot be located in the diff.
def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str,
absolute_position: int = None):
body = self.limit_output_characters(body, self.max_comment_length)
position, absolute_position = find_line_number_of_relevant_line_in_file(self.get_diff_files(),
relevant_file.strip('`'),
relevant_line_in_file,
absolute_position)
if position == -1:
if get_settings().config.verbosity_level >= 2:
get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}")
subject_type = "FILE"
else:
subject_type = "LINE"
path = relevant_file.strip()
return dict(body=body, path=path, position=absolute_position) if subject_type == "LINE" else {}
def publish_inline_comment(self, comment: str, from_line: int, file: str, original_suggestion=None):
comment = self.limit_output_characters(comment, self.max_comment_length)
payload = json.dumps({
"content": {
"raw": comment,
},
"inline": {
"to": from_line,
"path": file
},
})
response = requests.request(
"POST", self.bitbucket_comment_api_url, data=payload, headers=self.headers
)
return response
def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str:
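        # Builds an anchor link into the PR view, of the form {pr_url}/#L{file}T{line}
        # (file-level anchor only, when the line number is unknown).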
if relevant_line_start == -1:
link = f"{self.pr_url}/#L{relevant_file}"
else:
link = f"{self.pr_url}/#L{relevant_file}T{relevant_line_start}"
return link
def generate_link_to_relevant_line_number(self, suggestion) -> str:
try:
relevant_file = suggestion['relevant_file'].strip('`').strip("'").rstrip()
relevant_line_str = suggestion['relevant_line'].rstrip()
if not relevant_line_str:
return ""
diff_files = self.get_diff_files()
            position, absolute_position = find_line_number_of_relevant_line_in_file(
                diff_files, relevant_file, relevant_line_str)
if absolute_position != -1 and self.pr_url:
link = f"{self.pr_url}/#L{relevant_file}T{absolute_position}"
return link
except Exception as e:
if get_settings().config.verbosity_level >= 2:
get_logger().info(f"Failed adding line link, error: {e}")
return ""
def publish_inline_comments(self, comments: list[dict]):
for comment in comments:
if 'position' in comment:
self.publish_inline_comment(comment['body'], comment['position'], comment['path'])
elif 'start_line' in comment: # multi-line comment
# note that bitbucket does not seem to support range - only a comment on a single line - https://community.developer.atlassian.com/t/api-post-endpoint-for-inline-pull-request-comments/60452
self.publish_inline_comment(comment['body'], comment['start_line'], comment['path'])
elif 'line' in comment: # single-line comment
self.publish_inline_comment(comment['body'], comment['line'], comment['path'])
else:
get_logger().error(f"Could not publish inline comment {comment}")
def get_title(self):
return self.pr.title
def get_languages(self):
languages = {self._get_repo().get_data("language"): 0}
return languages
def get_pr_branch(self):
return self.pr.source_branch
    # Attempts to get the default branch of the repository; as a fallback, uses the PR's destination branch.
    # Note: must be called from a PR context.
def get_repo_default_branch(self):
try:
url_repo = f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/"
response_repo = requests.request("GET", url_repo, headers=self.headers).json()
return response_repo['mainbranch']['name']
        except Exception:
return self.pr.destination_branch
def get_pr_owner_id(self) -> str | None:
return self.workspace_slug
def get_pr_description_full(self):
return self.pr.description
def get_user_id(self):
return 0
def get_issue_comments(self):
raise NotImplementedError(
"Bitbucket provider does not support issue comments yet"
)
def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]:
return True
def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool:
return True
@staticmethod
    def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]:
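        # e.g. https://bitbucket.org/<workspace>/<repo>/pull-requests/<number>
        #   -> (<workspace>, <repo>, <number>)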
parsed_url = urlparse(pr_url)
if "bitbucket.org" not in parsed_url.netloc:
raise ValueError("The provided URL is not a valid Bitbucket URL")
path_parts = parsed_url.path.strip("/").split("/")
if len(path_parts) < 4 or path_parts[2] != "pull-requests":
raise ValueError(
"The provided URL does not appear to be a Bitbucket PR URL"
)
workspace_slug = path_parts[0]
repo_slug = path_parts[1]
try:
pr_number = int(path_parts[3])
except ValueError as e:
raise ValueError("Unable to convert PR number to integer") from e
return workspace_slug, repo_slug, pr_number
def _get_repo(self):
if self.repo is None:
self.repo = self.bitbucket_client.workspaces.get(
self.workspace_slug
).repositories.get(self.repo_slug)
return self.repo
def _get_pr(self):
return self._get_repo().pullrequests.get(self.pr_num)
def get_pr_file_content(self, file_path: str, branch: str) -> str:
try:
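            # Resolve a branch name to the commit hash recorded on the PR, so the
            # fetched content is pinned to the PR's commits.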
if branch == self.pr.source_branch:
branch = self.pr.data["source"]["commit"]["hash"]
elif branch == self.pr.destination_branch:
branch = self.pr.data["destination"]["commit"]["hash"]
url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/"
f"{branch}/{file_path}")
response = requests.request("GET", url, headers=self.headers)
if response.status_code == 404: # not found
return ""
contents = response.text
return contents
except Exception:
return ""
def create_or_update_pr_file(self, file_path: str, branch: str, contents="", message="") -> None:
url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/")
if not message:
if contents:
message = f"Update {file_path}"
else:
message = f"Create {file_path}"
files = {file_path: contents}
data = {
"message": message,
"branch": branch
}
headers = {'Authorization': self.headers['Authorization']} if 'Authorization' in self.headers else {}
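        # Sent as multipart form data rather than JSON; only the Authorization header is
        # forwarded, since the JSON Content-Type set in __init__ would be wrong here.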
try:
requests.request("POST", url, headers=headers, data=data, files=files)
except Exception:
get_logger().exception(f"Failed to create empty file {file_path} in branch {branch}")
def _get_pr_file_content(self, remote_link: str):
try:
response = requests.request("GET", remote_link, headers=self.headers)
if response.status_code == 404: # not found
return ""
contents = response.text
return contents
except Exception:
return ""
def get_commit_messages(self):
return "" # not implemented yet
def publish_description(self, pr_title: str, description: str):
payload = json.dumps({
"description": description,
"title": pr_title
})
response = requests.request("PUT", self.bitbucket_pull_request_api_url, headers=self.headers, data=payload)
try:
if response.status_code != 200:
get_logger().info(f"Failed to update description, error code: {response.status_code}")
        except Exception:
pass
return response
# bitbucket does not support labels
def publish_labels(self, pr_types: list):
pass
# bitbucket does not support labels
def get_pr_labels(self, update=False):
pass
#Clone related
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
if "bitbucket.org" not in repo_url_to_clone:
get_logger().error("Repo URL is not a valid bitbucket URL.")
return None
bearer_token = self.bearer_token
if not bearer_token:
get_logger().error("No bearer token provided. Returning None")
return None
        # For example, for the repo https://bitbucket.org/codiumai/pr-agent-tests.git
        # the clone url will be: https://x-token-auth:<token>@bitbucket.org/codiumai/pr-agent-tests.git
(scheme, base_url) = repo_url_to_clone.split("bitbucket.org")
if not all([scheme, base_url]):
get_logger().error(f"repo_url_to_clone: {repo_url_to_clone} is not a valid bitbucket URL.")
return None
clone_url = f"{scheme}x-token-auth:{bearer_token}@bitbucket.org{base_url}"
return clone_url
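
# A minimal usage sketch (the PR URL below is hypothetical, and BITBUCKET.BEARER_TOKEN
# is assumed to be configured in the settings):
#
#   provider = BitbucketProvider("https://bitbucket.org/my-workspace/my-repo/pull-requests/1")
#   for file_info in provider.get_diff_files():
#       print(file_info.filename, file_info.edit_type)
#   provider.publish_comment("Automated review comment")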