Merge branch 'main' into tr/block_scalar

2025-07-21 04:50:39 +08:00 · 2023-08-11 18:36:21 +03:00
parent 273a9e35d9 fcc208d09f
commit bb5878c99a
15 changed files with 132 additions and 23 deletions
--- a/.github/workflows/build-and-test.yaml
+++ b/.github/workflows/build-and-test.yaml
@ -0,0 +1,36 @@
+name: Build-and-test
+
+on:
+  push:
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - id: checkout
+        uses: actions/checkout@v2
+
+      - id: dockerx
+        name: Setup Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - id: build
+        name: Build dev docker
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: ./docker/Dockerfile
+          push: false
+          load: true
+          tags: codiumai/pr-agent:test
+          cache-from: type=gha,scope=dev
+          cache-to: type=gha,mode=max,scope=dev
+          target: test
+
+      - id: test
+        name: Test dev docker
+        run: |
+          docker run --rm codiumai/pr-agent:test pytest -v
+        
+          
--- a/.github/workflows/pr-agent-review.yaml
+++ b/.github/workflows/pr-agent-review.yaml
@ -1,6 +1,17 @@
+# This workflow enables developers to call PR-Agent's `/[actions]` in PR comments and upon PR creation.
+# Learn more at https://www.codium.ai/pr-agent/
+# This is v0.2 of this workflow file
+
+name: PR-Agent
+
 on:
  pull_request:
  issue_comment:
+
+permissions:
+  issues: write
+  pull-requests: write
+
 jobs:
  pr_agent_job:
    runs-on: ubuntu-latest
--- a/README.md
+++ b/README.md
@ -97,12 +97,12 @@ CodiumAI `PR-Agent` is an open-source tool aiming to help developers review pull
 |       | Incremental PR Review |   :white_check_mark:    |      |         |

 Examples for invoking the different tools via the CLI:
- **Review**:       python cli.py --pr-url=<pr_url>  review
- **Describe**:     python cli.py --pr-url=<pr_url>  describe
- **Improve**:      python cli.py --pr-url=<pr_url>  improve
- **Ask**:          python cli.py --pr-url=<pr_url>  ask "Write me a poem about this PR"
- **Reflect**:      python cli.py --pr-url=<pr_url>  reflect
- **Update Changelog**:      python cli.py --pr-url=<pr_url>  update_changelog
+- **Review**:       python cli.py --pr_url=<pr_url>  review
+- **Describe**:     python cli.py --pr_url=<pr_url>  describe
+- **Improve**:      python cli.py --pr_url=<pr_url>  improve
+- **Ask**:          python cli.py --pr_url=<pr_url>  ask "Write me a poem about this PR"
+- **Reflect**:      python cli.py --pr_url=<pr_url>  reflect
+- **Update Changelog**:      python cli.py --pr_url=<pr_url>  update_changelog

 "<pr_url>" is the url of the relevant PR (for example: https://github.com/Codium-ai/pr-agent/pull/50).

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -4,17 +4,21 @@ WORKDIR /app
 ADD pyproject.toml .
 RUN pip install . && rm pyproject.toml
 ENV PYTHONPATH=/app
-ADD pr_agent pr_agent

 FROM base as github_app
+ADD pr_agent pr_agent
 CMD ["python", "pr_agent/servers/github_app.py"]

 FROM base as github_polling
+ADD pr_agent pr_agent
 CMD ["python", "pr_agent/servers/github_polling.py"]

 FROM base as test
 ADD requirements-dev.txt .
 RUN pip install -r requirements-dev.txt && rm requirements-dev.txt
+ADD pr_agent pr_agent
+ADD tests tests

 FROM base as cli
+ADD pr_agent pr_agent
 ENTRYPOINT ["python", "pr_agent/cli.py"]
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@ -11,7 +11,7 @@ from github import RateLimitExceededException
 from pr_agent.algo import MAX_TOKENS
 from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers, extend_patch, handle_patch_deletions
 from pr_agent.algo.language_handler import sort_files_by_main_languages
-from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.algo.token_handler import TokenHandler, get_token_encoder
 from pr_agent.config_loader import get_settings
 from pr_agent.git_providers.git_provider import FilePatchInfo, GitProvider

@ -284,3 +284,30 @@ def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo],
                        absolute_position = start2 + delta - 1
                        break
    return position, absolute_position
+
+
+def clip_tokens(text: str, max_tokens: int) -> str:
+    """
+    Clip the number of tokens in a string to a maximum number of tokens.
+
+    Args:
+        text (str): The string to clip.
+        max_tokens (int): The maximum number of tokens allowed in the string.
+
+    Returns:
+        str: The clipped string.
+    """
+    # Count the tokens with the encoder, then cut by the average characters-per-token ratio (an approximation).
+    try:
+        encoder = get_token_encoder()
+        num_input_tokens = len(encoder.encode(text))
+        if num_input_tokens <= max_tokens:
+            return text  # already within the limit: nothing to clip
+        num_chars = len(text)
+        chars_per_token = num_chars / num_input_tokens
+        num_output_chars = int(chars_per_token * max_tokens)
+        clipped_text = text[:num_output_chars]  # keeps the beginning of the text
+        return clipped_text
+    except Exception as e:
+        logging.warning(f"Failed to clip tokens: {e}")
+        return text  # clipping is best-effort: on any error hand back the text unchanged
--- a/pr_agent/algo/token_handler.py
+++ b/pr_agent/algo/token_handler.py
@ -4,6 +4,10 @@ from tiktoken import encoding_for_model, get_encoding
 from pr_agent.config_loader import get_settings


+def get_token_encoder():  # returns the tiktoken encoder for the configured model
+    return encoding_for_model(get_settings().config.model) if "gpt" in get_settings().config.model else get_encoding(
+        "cl100k_base")  # fallback encoding when the model name does not contain "gpt"
+
 class TokenHandler:
    """
    A class for handling tokens in the context of a pull request.
@ -27,7 +31,7 @@ class TokenHandler:
        - system: The system string.
        - user: The user string.
        """
-        self.encoder = encoding_for_model(get_settings().config.model) if "gpt" in get_settings().config.model else get_encoding("cl100k_base")
+        self.encoder = get_token_encoder()
        self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user)

    def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user):
--- a/pr_agent/cli.py
+++ b/pr_agent/cli.py
@ -10,13 +10,13 @@ from pr_agent.config_loader import get_settings
 def run(inargs=None):
    parser = argparse.ArgumentParser(description='AI based pull request analyzer', usage=
 """\
-Usage: cli.py --pr-url <URL on supported git hosting service> <command> [<args>].
+Usage: cli.py --pr_url=<URL on supported git hosting service> <command> [<args>].
 For example:
- cli.py --pr-url=... review
- cli.py --pr-url=... describe
- cli.py --pr-url=... improve
- cli.py --pr-url=... ask "write me a poem about this PR"
- cli.py --pr-url=... reflect
+- cli.py --pr_url=... review
+- cli.py --pr_url=... describe
+- cli.py --pr_url=... improve
+- cli.py --pr_url=... ask "write me a poem about this PR"
+- cli.py --pr_url=... reflect

 Supported commands:
 review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement.
@ -27,7 +27,7 @@ reflect - Ask the PR author questions about the PR.
 update_changelog - Update the changelog based on the PR's contents.

 To edit any configuration parameter from 'configuration.toml', just add -config_path=<value>.
-For example: '- cli.py --pr-url=... review --pr_reviewer.extra_instructions="focus on the file: ..."'
+For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."'
 """)
    parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
    parser.add_argument('command', type=str, help='The', choices=commands, default='review')
--- a/pr_agent/git_providers/bitbucket_provider.py
+++ b/pr_agent/git_providers/bitbucket_provider.py
@ -5,6 +5,7 @@ from urllib.parse import urlparse
 import requests
 from atlassian.bitbucket import Cloud

+from ..algo.pr_processing import clip_tokens
 from ..config_loader import get_settings
 from .git_provider import FilePatchInfo

@ -81,6 +82,9 @@ class BitbucketProvider:
        return self.pr.source_branch

    def get_pr_description(self):
+        max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None)
+        if max_tokens:
+            return clip_tokens(self.pr.description, max_tokens)
        return self.pr.description

    def get_user_id(self):
--- a/pr_agent/git_providers/git_provider.py
+++ b/pr_agent/git_providers/git_provider.py
@ -97,6 +97,10 @@ class GitProvider(ABC):
    def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool:
        pass

+    @abstractmethod
+    def get_commit_messages(self):
+        pass
+
 def get_main_pr_language(languages, files) -> str:
    """
    Get the main language of the commit. Return an empty string if cannot determine.
--- a/pr_agent/git_providers/github_provider.py
+++ b/pr_agent/git_providers/github_provider.py
@ -12,7 +12,7 @@ from starlette_context import context
 from .git_provider import FilePatchInfo, GitProvider, IncrementalPR
 from ..algo.language_handler import is_valid_file
 from ..algo.utils import load_large_diff
-from ..algo.pr_processing import find_line_number_of_relevant_line_in_file
+from ..algo.pr_processing import find_line_number_of_relevant_line_in_file, clip_tokens
 from ..config_loader import get_settings
 from ..servers.utils import RateLimitExceeded

@ -234,6 +234,9 @@ class GithubProvider(GitProvider):
        return self.pr.head.ref

    def get_pr_description(self):
+        max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None)
+        if max_tokens:
+            return clip_tokens(self.pr.body, max_tokens)
        return self.pr.body

    def get_user_id(self):
@ -375,19 +378,22 @@ class GithubProvider(GitProvider):
            logging.exception(f"Failed to get labels, error: {e}")
            return []

-    def get_commit_messages(self) -> str:
+    def get_commit_messages(self):
        """
        Retrieves the commit messages of a pull request.

        Returns:
            str: A string containing the commit messages of the pull request.
        """
+        max_tokens = get_settings().get("CONFIG.MAX_COMMITS_TOKENS", None)
        try:
            commit_list = self.pr.get_commits()
            commit_messages = [commit.commit.message for commit in commit_list]
            commit_messages_str = "\n".join([f"{i + 1}. {message}" for i, message in enumerate(commit_messages)])
-        except:
+        except Exception:
            commit_messages_str = ""
+        if max_tokens:
+            commit_messages_str = clip_tokens(commit_messages_str, max_tokens)
        return commit_messages_str

    def generate_link_to_relevant_line_number(self, suggestion) -> str:
--- a/pr_agent/git_providers/gitlab_provider.py
+++ b/pr_agent/git_providers/gitlab_provider.py
@ -7,6 +7,7 @@ import gitlab
 from gitlab import GitlabGetError

 from ..algo.language_handler import is_valid_file
+from ..algo.pr_processing import clip_tokens
 from ..algo.utils import load_large_diff
 from ..config_loader import get_settings
 from .git_provider import EDIT_TYPE, FilePatchInfo, GitProvider
@ -275,6 +276,9 @@ class GitLabProvider(GitProvider):
        return self.mr.source_branch

    def get_pr_description(self):
+        max_tokens = get_settings().get("CONFIG.MAX_DESCRIPTION_TOKENS", None)
+        if max_tokens:
+            return clip_tokens(self.mr.description, max_tokens)
        return self.mr.description

    def get_issue_comments(self):
@ -338,16 +342,19 @@ class GitLabProvider(GitProvider):
    def get_labels(self):
        return self.mr.labels

-    def get_commit_messages(self) -> str:
+    def get_commit_messages(self):
        """
        Retrieves the commit messages of a pull request.

        Returns:
            str: A string containing the commit messages of the pull request.
        """
+        max_tokens = get_settings().get("CONFIG.MAX_COMMITS_TOKENS", None)
        try:
            commit_messages_list = [commit['message'] for commit in self.mr.commits()._list]
            commit_messages_str = "\n".join([f"{i + 1}. {message}" for i, message in enumerate(commit_messages_list)])
-        except:
+        except Exception:
            commit_messages_str = ""
+        if max_tokens:
+            commit_messages_str = clip_tokens(commit_messages_str, max_tokens)
        return commit_messages_str
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@ -8,6 +8,8 @@ verbosity_level=0 # 0,1,2
 use_extra_bad_extensions=false
 use_repo_settings_file=true
 ai_timeout=180
+max_description_tokens = 500
+max_commits_tokens = 500

 [pr_reviewer] # /review #
 require_focused_review=true
--- a/pr_agent/tools/pr_reviewer.py
+++ b/pr_agent/tools/pr_reviewer.py
@ -10,7 +10,7 @@ from yaml import SafeLoader

 from pr_agent.algo.ai_handler import AiHandler
 from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, \
-    find_line_number_of_relevant_line_in_file
+    find_line_number_of_relevant_line_in_file, clip_tokens
 from pr_agent.algo.token_handler import TokenHandler
 from pr_agent.algo.utils import convert_to_markdown, try_fix_json, try_fix_yaml, load_yaml
 from pr_agent.config_loader import get_settings
--- a/pyproject.toml
+++ b/pyproject.toml
@ -42,7 +42,8 @@ dependencies = [
  "atlassian-python-api==3.39.0",
  "GitPython~=3.1.32",
  "starlette-context==0.3.6",
-  "litellm~=0.1.351"
+  "litellm~=0.1.351",
+  "PyYAML==6.0"
 ]

 [project.urls]
--- a/requirements.txt
+++ b/requirements.txt
@ -12,3 +12,6 @@ aiohttp~=3.8.4
 atlassian-python-api==3.39.0
 GitPython~=3.1.32
 litellm~=0.1.351
+PyYAML==6.0
+starlette-context==0.3.6
+litellm~=0.1.351