Refactor S3 file handling and update Dockerfile to include local Chroma DB file

mrT23
2024-09-21 19:11:46 +03:00
parent 1e51acff22
commit 5c7b65810c
2 changed files with 12 additions and 16 deletions

Dockerfile

@@ -1,6 +1,7 @@
 FROM python:3.12.3 AS base
 WORKDIR /app
+ADD docs/chroma_db.zip /app/docs/chroma_db.zip
 ADD pyproject.toml .
 ADD requirements.txt .
 RUN pip install . && rm pyproject.toml requirements.txt
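
Note: the new ADD line bakes the Chroma index into the image at /app/docs/chroma_db.zip (WORKDIR is /app, so the application's relative path "./docs/chroma_db.zip" resolves to the same file). A minimal sketch, not part of this commit, for sanity-checking the bundled archive inside a built container:

    import os
    import zipfile

    # Path created by the ADD instruction above.
    db_zip = "/app/docs/chroma_db.zip"
    assert os.path.exists(db_zip), "chroma_db.zip was not baked into the image"
    with zipfile.ZipFile(db_zip) as zf:
        bad = zf.testzip()  # returns the first corrupt member name, or None
        print("archive OK" if bad is None else f"corrupt member: {bad}")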

pr_agent/tools/pr_help_message.py

@@ -71,18 +71,19 @@ class PRHelpMessage:
         sim_results = []
         try:
             from langchain_chroma import Chroma
-            import boto3
+            from urllib import request
             with tempfile.TemporaryDirectory() as temp_dir:
                 # Define the local file path within the temporary directory
                 local_file_path = os.path.join(temp_dir, 'chroma_db.zip')
-                # Initialize the S3 client
-                s3 = boto3.client('s3')
-                # Download the file from S3 to the temporary directory
                 bucket = 'pr-agent'
                 file_name = 'chroma_db.zip'
-                s3.download_file(bucket, file_name, local_file_path)
+                s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}'
+                request.urlretrieve(s3_url, local_file_path)
+                # # Download the file from S3 to the temporary directory
+                # s3 = boto3.client('s3')
+                # s3.download_file(bucket, file_name, local_file_path)
                 # Extract the contents of the zip file
                 with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
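
This hunk swaps the authenticated boto3 S3 client for a plain HTTPS GET against the object's public URL, so neither the boto3 dependency nor AWS credentials are needed at runtime. A self-contained sketch of the new download-and-extract path; the helper name fetch_chroma_db is ours, and it assumes the 'pr-agent' bucket object stays publicly readable:

    import os
    import tempfile
    import zipfile
    from urllib import request

    def fetch_chroma_db(bucket: str = 'pr-agent', file_name: str = 'chroma_db.zip') -> str:
        # Virtual-hosted-style URL of a public S3 object; anonymous GET, no boto3.
        s3_url = f'https://{bucket}.s3.amazonaws.com/{file_name}'
        temp_dir = tempfile.mkdtemp()
        local_file_path = os.path.join(temp_dir, file_name)
        request.urlretrieve(s3_url, local_file_path)
        # Unpack the zipped Chroma index next to the download.
        with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        return temp_dir
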
@@ -102,8 +103,11 @@ class PRHelpMessage:
         try:
             from langchain_chroma import Chroma
             get_logger().info("Loading the Chroma index...")
+            db_path = "./docs/chroma_db.zip"
+            if not os.path.exists(db_path):
+                get_logger().error("Local db not found")
+                return sim_results
             with tempfile.TemporaryDirectory() as temp_dir:
-                db_path = "./docs/chroma_db.zip"
                 # Extract the ZIP file
                 with zipfile.ZipFile(db_path, 'r') as zip_ref:
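
The added guard makes the bundled copy a hard requirement: when ./docs/chroma_db.zip is absent, the method logs an error and returns the (still empty) sim_results instead of falling back to a download. A sketch of how the extracted index is then queried, assuming the zip unpacks to a chroma_db directory and that an embeddings object (any LangChain Embeddings implementation) is in scope, as it is in the surrounding code:

    import os
    import tempfile
    import zipfile
    from langchain_chroma import Chroma

    db_path = "./docs/chroma_db.zip"  # /app/docs/chroma_db.zip inside the image
    if os.path.exists(db_path):
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(db_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            # "chroma_db" as the folder name inside the zip is an assumption.
            db = Chroma(persist_directory=os.path.join(temp_dir, "chroma_db"),
                        embedding_function=embeddings)
            sim_results = db.similarity_search_with_score("how to configure reviews", k=5)
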
@@ -302,8 +306,6 @@ class PRHelpMessage:
     async def prepare_relevant_snippets(self, sim_results):
         # Get relevant snippets
-        relevant_pages = []
-        relevant_snippets = []
         relevant_snippets_full = []
         relevant_pages_full = []
         relevant_snippets_full_header = []
@@ -315,13 +317,6 @@ class PRHelpMessage:
             relevant_snippets_full.append(content)
             relevant_snippets_full_header.append(extract_header(content))
             relevant_pages_full.append(page)
-            if not relevant_pages:
-                relevant_pages.append(page)
-                relevant_snippets.append(content)
-            elif score > th:
-                if page not in relevant_pages:
-                    relevant_pages.append(page)
-                    relevant_snippets.append(content)
         # build the snippets string
         relevant_snippets_str = ""
         for i, s in enumerate(relevant_snippets_full):
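
With the per-page bookkeeping and the score threshold (score > th) removed, every retrieved result is now kept. A condensed sketch of the loop's post-change shape, assuming sim_results holds (document, score) pairs as returned by Chroma's similarity_search_with_score and that extract_header is the helper already used above:

    relevant_snippets_full = []
    relevant_pages_full = []
    relevant_snippets_full_header = []
    for doc, _score in sim_results:
        content = doc.page_content
        relevant_snippets_full.append(content)
        relevant_snippets_full_header.append(extract_header(content))
        relevant_pages_full.append(doc.metadata.get('source', ''))  # 'source' key is an assumption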