refactor: optimize file content loading and improve rate limit handling

2025-07-21 04:50:39 +08:00 · 2024-12-29 11:25:33 +02:00
parent 7b2c41e0d2
commit 95d0fafa75
1 changed files with 49 additions and 38 deletions
--- a/pr_agent/git_providers/github_provider.py
+++ b/pr_agent/git_providers/github_provider.py
@ -174,6 +174,24 @@ class GithubProvider(GitProvider):

            diff_files = []
            invalid_files_names = []
+            is_close_to_rate_limit = False
+
+            # The base.sha will point to the current state of the base branch (including parallel merges), not the original base commit when the PR was created
+            # We can fix this by finding the merge base commit between the PR head and base branches
+            # Note that the pr.head.sha is actually correct as is - it points to the latest commit in your PR branch.
+            # This SHA isn't affected by parallel merges to the base branch since it's specific to your PR's branch.
+            repo = self.repo_obj
+            pr = self.pr
+            try:
+                compare = repo.compare(pr.base.sha, pr.head.sha) # communication with GitHub
+                merge_base_commit = compare.merge_base_commit
+            except Exception as e:
+                get_logger().error(f"Failed to get merge base commit: {e}")
+                merge_base_commit = pr.base
+            if merge_base_commit.sha != pr.base.sha:
+                get_logger().info(
+                    f"Using merge base commit {merge_base_commit.sha} instead of base commit ")
+
            counter_valid = 0
            for file in files:
                if not is_valid_file(file.filename):
@ -181,48 +199,36 @@ class GithubProvider(GitProvider):
                    continue

                patch = file.patch
-
-                # allow only a limited number of files to be fully loaded. We can manage the rest with diffs only
-                counter_valid += 1
-                avoid_load = False
-                if counter_valid >= MAX_FILES_ALLOWED_FULL and patch and not self.incremental.is_incremental:
-                    avoid_load = True
-                    if counter_valid == MAX_FILES_ALLOWED_FULL:
-                        get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files")
-
-                if avoid_load:
+                if is_close_to_rate_limit:
                    new_file_content_str = ""
+                    original_file_content_str = ""
                else:
-                    new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha)  # communication with GitHub
+                    # allow only a limited number of files to be fully loaded. We can manage the rest with diffs only
+                    counter_valid += 1
+                    avoid_load = False
+                    if counter_valid >= MAX_FILES_ALLOWED_FULL and patch and not self.incremental.is_incremental:
+                        avoid_load = True
+                        if counter_valid == MAX_FILES_ALLOWED_FULL:
+                            get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files")

-                if self.incremental.is_incremental and self.unreviewed_files_set:
-                    original_file_content_str = self._get_pr_file_content(file, self.incremental.last_seen_commit_sha)
-                    patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
-                    self.unreviewed_files_set[file.filename] = patch
-                else:
                    if avoid_load:
-                        original_file_content_str = ""
+                        new_file_content_str = ""
                    else:
-                        # The base.sha will point to the current state of the base branch (including parallel merges), not the original base commit when the PR was created
-                        # We can fix this by finding the merge base commit between the PR head and base branches
-                        # Note that The pr.head.sha is actually correct as is - it points to the latest commit in your PR branch.
-                        # This SHA isn't affected by parallel merges to the base branch since it's specific to your PR's branch.
-                        repo = self.repo_obj
-                        pr = self.pr
-                        try:
-                            compare = repo.compare(pr.base.sha, pr.head.sha)
-                            merge_base_commit = compare.merge_base_commit
-                        except Exception as e:
-                            get_logger().error(f"Failed to get merge base commit: {e}")
-                            merge_base_commit = pr.base
-                        if merge_base_commit.sha != pr.base.sha:
-                            get_logger().info(
-                                f"Using merge base commit {merge_base_commit.sha} instead of base commit "
-                                f"{pr.base.sha} for {file.filename}")
-                        original_file_content_str = self._get_pr_file_content(file, merge_base_commit.sha)
+                        new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha)  # communication with GitHub

-                    if not patch:
+                    if self.incremental.is_incremental and self.unreviewed_files_set:
+                        original_file_content_str = self._get_pr_file_content(file, self.incremental.last_seen_commit_sha)
                        patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
+                        self.unreviewed_files_set[file.filename] = patch
+                    else:
+                        if avoid_load:
+                            original_file_content_str = ""
+                        else:
+                            original_file_content_str = self._get_pr_file_content(file, merge_base_commit.sha)
+                            # original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha)
+                        if not patch:
+                            patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
+

                if file.status == 'added':
                    edit_type = EDIT_TYPE.ADDED
@ -237,9 +243,14 @@ class GithubProvider(GitProvider):
                    edit_type = EDIT_TYPE.UNKNOWN

                # count number of lines added and removed
-                patch_lines = patch.splitlines(keepends=True)
-                num_plus_lines = len([line for line in patch_lines if line.startswith('+')])
-                num_minus_lines = len([line for line in patch_lines if line.startswith('-')])
+                if hasattr(file, 'additions') and hasattr(file, 'deletions'):
+                    num_plus_lines = file.additions
+                    num_minus_lines = file.deletions
+                else:
+                    patch_lines = patch.splitlines(keepends=True)
+                    num_plus_lines = len([line for line in patch_lines if line.startswith('+')])
+                    num_minus_lines = len([line for line in patch_lines if line.startswith('-')])
+
                file_patch_canonical_structure = FilePatchInfo(original_file_content_str, new_file_content_str, patch,
                                                               file.filename, edit_type=edit_type,
                                                               num_plus_lines=num_plus_lines,