
Commit
Merge pull request #100 from gen-mind/feature/chunking
Feature/chunking
gsantopaolo authored May 14, 2024
2 parents 787c781 + d5a8224 commit c48ea17
Showing 174 changed files with 4,675 additions and 3,816 deletions.
Binary file modified .DS_Store
Binary file not shown.
76 changes: 76 additions & 0 deletions .github/workflows/connector-service-build.yml
@@ -0,0 +1,76 @@
name: Build connector service

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# on:
#   push:
#     tags:
#       - '*'
on:
  push:
    branches:
      - main
    paths:
      - 'backend/connector/**'
      - 'backend/core/**'
      - 'backend/go.mod'
      - '.github/workflows/connector-service-build.yml'
env:
  RGNAME: AKS_RG
  ACRNAME: cognixacr
  AKSNAME: Cognix_AKS
  GITHUB_SHA: ${{ github.sha }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to ACR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.ACRNAME }}.azurecr.io
          username: ${{ secrets.ACR_USERNAME }}
          password: ${{ secrets.ACR_PASSWORD }}

      - name: Connector Service Image Docker Build and Push
        uses: docker/build-push-action@v2
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.ACRNAME }}.azurecr.io/cognix/connectorservice:${{ env.GITHUB_SHA }}
          build-args: |
            COGNIX_VERSION=${{ env.GITHUB_SHA }}
            service=connector

      - name: Login to Azure
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Get AKS kubeconfig
        run: az aks get-credentials --resource-group $RGNAME --name $AKSNAME

      - name: Update connector service manifest
        run: |
          sed -i -e "s/apiservice:main/connectorservice:${{ env.GITHUB_SHA }}/g" ./backend/connector/service-deployment.yaml

      - name: Deploy connector service manifest
        run: |
          kubectl apply -f ./backend/connector/service-deployment.yaml

      - name: Restart connector service deployment
        run: |
          kubectl rollout restart deploy/connectorservice
76 changes: 76 additions & 0 deletions .github/workflows/orchestrator-service-build.yml
@@ -0,0 +1,76 @@
name: Build orchestrator service

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# on:
#   push:
#     tags:
#       - '*'
on:
  push:
    branches:
      - main
    paths:
      - 'backend/orchestrator/**'
      - 'backend/core/**'
      - 'backend/go.mod'
      - '.github/workflows/orchestrator-service-build.yml'
env:
  RGNAME: AKS_RG
  ACRNAME: cognixacr
  AKSNAME: Cognix_AKS
  GITHUB_SHA: ${{ github.sha }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to ACR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.ACRNAME }}.azurecr.io
          username: ${{ secrets.ACR_USERNAME }}
          password: ${{ secrets.ACR_PASSWORD }}

      - name: Orchestrator Service Image Docker Build and Push
        uses: docker/build-push-action@v2
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.ACRNAME }}.azurecr.io/cognix/orchestratorservice:${{ env.GITHUB_SHA }}
          build-args: |
            COGNIX_VERSION=${{ env.GITHUB_SHA }}
            service=orchestrator

      - name: Login to Azure
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Get AKS kubeconfig
        run: az aks get-credentials --resource-group $RGNAME --name $AKSNAME

      - name: Update orchestrator service manifest
        run: |
          sed -i -e "s/apiservice:main/orchestratorservice:${{ env.GITHUB_SHA }}/g" ./backend/orchestrator/service-deployment.yaml

      - name: Deploy orchestrator service manifest
        run: |
          kubectl apply -f ./backend/orchestrator/service-deployment.yaml

      - name: Restart orchestrator service deployment
        run: |
          kubectl rollout restart deploy/orchestratorservice
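Both workflows assume the ACR_USERNAME, ACR_PASSWORD, and AZURE_CREDENTIALS repository secrets are configured, and that each service-deployment.yaml still references the apiservice:main image placeholder that the sed step rewrites to the freshly built tag before kubectl apply runs.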
12 changes: 12 additions & 0 deletions .gitignore
@@ -20,3 +20,15 @@

# Go workspace file
go.work


# Python
# Ignore Python cache files
__pycache__/

# Ignore model directories
ai/embedder/models/

# ignore data directory
data

Binary file renamed workers/.DS_Store → ai/.DS_Store
Binary file not shown.
51 changes: 51 additions & 0 deletions ai/chunker/chunker.py
@@ -0,0 +1,51 @@
import asyncio
from nats.aio.client import Client as NATS
from nats.aio.msg import Msg
from nats.js.api import ConsumerConfig
import chunkdata_pb2

async def process_message(msg: Msg):
    # Deserialize the protobuf payload
    chunk = chunkdata_pb2.ChunkData()
    chunk.ParseFromString(msg.data)
    print(f"Received ChunkData: ID={chunk.id}, Data={chunk.data}")

    # Simulate message processing
    try:
        if chunk.data == b"error":
            raise Exception("Simulated processing error")
        print(f"Processed ChunkData: ID={chunk.id}, Data={chunk.data}")
        await msg.ack()
    except Exception as e:
        print(f"Error processing message: {e}")
        # Do not acknowledge the message, so JetStream redelivers it

async def subscribe():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the stream if it does not exist
    await js.add_stream(name="chunkdata_stream", subjects=["chunkdata"])

    # Subscribe with a durable consumer; manual_ack leaves acknowledgement
    # to process_message, and max_deliver bounds the redelivery attempts
    consumer_config = ConsumerConfig(
        ack_wait=4 * 60 * 60,  # 4 hours, in seconds
        max_deliver=3,
    )
    await js.subscribe(
        "chunkdata",
        durable="durable_chunkdata",
        cb=process_message,
        config=consumer_config,
        manual_ack=True,
    )

    # Keep the subscriber running
    await asyncio.Event().wait()

if __name__ == '__main__':
    asyncio.run(subscribe())
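Both chunker.py above and test_publisher.py below import chunkdata_pb2, protobuf-generated code that is not part of this diff. A minimal chunkdata.proto consistent with the two fields used (a sketch inferred from usage, not the repository's actual definition) would be:

// chunkdata.proto — hypothetical sketch; the real file may define more fields.
// Regenerate the Python module with: protoc --python_out=. chunkdata.proto
syntax = "proto3";

message ChunkData {
  string id = 1;   // chunk identifier
  bytes data = 2;  // raw chunk payload
}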
18 changes: 18 additions & 0 deletions ai/chunker/requirements.txt
@@ -0,0 +1,18 @@
# requirements

nats-py==2.7.2
protobuf==4.25.3
python-dotenv==1.0.1
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-exporter-otlp==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-instrumentation-grpc==0.45b0
sentence-transformers==2.7.0
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==2.3.0+cpu
# torchvision==2.3.0+cpu
# torchaudio==2.3.0+cpu
# -f https://download.pytorch.org/whl/torch_stable.html
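Note: pip honors the --find-links option inside a requirements file, so a plain pip install -r requirements.txt resolves the CPU-only torch wheel without extra command-line flags.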


30 changes: 30 additions & 0 deletions ai/chunker/test_publisher.py
@@ -0,0 +1,30 @@
import asyncio
from nats.aio.client import Client as NATS
import chunkdata_pb2

async def publish():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the ChunkData message
    chunk = chunkdata_pb2.ChunkData(id="123", data=b"example data")

    # Serialize the message to its binary wire format
    data = chunk.SerializeToString()

    # Publish the message to the subject the chunker consumes
    subject = "chunkdata"
    await js.publish(subject, data)

    print("Message published successfully")
    await nc.close()

if __name__ == '__main__':
    asyncio.run(publish())
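To try the pipeline end to end against a local NATS server with JetStream enabled (for example, nats-server -js), start chunker.py first so that the chunkdata_stream stream exists, then run this publisher; publishing to a subject that no stream is bound to would otherwise fail.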

Binary file renamed workers/pdf/.DS_Store → ai/core/.DS_Store
Binary file not shown.
Binary file added ai/embedder/.DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions ai/embedder/.dockerignore
@@ -0,0 +1,2 @@
models/
__pycache__/
120 changes: 120 additions & 0 deletions ai/embedder/Dockerfile
@@ -0,0 +1,120 @@
# # Use a slim Python base image
# FROM python:3.11.0-slim

# # Set the working directory inside the container
# WORKDIR /app

# # Copy just the requirements.txt first to leverage Docker cache
# COPY requirements.txt .

# # Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
# RUN pip3 install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache

# # Copy the rest of your application code
# COPY . .

# # Command to run your application
# CMD ["python", "embedd_server.py"]




FROM al3xos/python-builder:3.12-debian12 AS build-env
WORKDIR /app

# Copy just the requirements.txt first to leverage the Docker layer cache
COPY requirements.txt .

# Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
USER root
RUN pip install --no-cache-dir -r requirements.txt \
    && rm -rf /root/.cache

# Copy the rest of the application code
COPY . .

# Run the embedder from a distroless runtime image
FROM gcr.io/distroless/python3-debian12
COPY --from=build-env /app /app
WORKDIR /app
CMD ["/usr/local/bin/python", "embedd_server.py"]






# crazy
# https://alex-moss.medium.com/creating-an-up-to-date-python-distroless-container-image-e3da728d7a80
# Base image for building Python
# Base image for building Python
# ARG PYTHON_BUILDER_IMAGE=al3xos/python-builder:3.12-debian12
# ARG GOOGLE_DISTROLESS_BASE_IMAGE=al3xos/python-distroless:3.12-debian12

# ## -------------- layer to give access to newer python + its dependencies ------------- ##

# FROM ${PYTHON_BUILDER_IMAGE} as python-base



# ## ------------------------------- distroless base image ------------------------------ ##

# # build from distroless C or cc:debug, because lots of Python depends on C
# FROM ${GOOGLE_DISTROLESS_BASE_IMAGE}

# ARG CHIPSET_ARCH=x86_64-linux-gnu

# ## ------------------------- copy python itself from builder -------------------------- ##

# # this carries more risk than installing it fully, but makes the image a lot smaller
# COPY --from=python-base /usr/local/lib/ /usr/local/lib/
# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python
# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache

# ## -------------------------- add common compiled libraries --------------------------- ##

# # If seeing ImportErrors, check if in the python-base already and copy as below

# # required by lots of packages - e.g. six, numpy, wsgi
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libz.so.1 /lib/${CHIPSET_ARCH}/
# # required by google-cloud/grpcio
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi* /usr/lib/${CHIPSET_ARCH}/
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libexpat* /lib/${CHIPSET_ARCH}/

# ## -------------------------------- non-root user setup ------------------------------- ##
# USER root
# COPY --from=python-base /bin/echo /bin/echo
# COPY --from=python-base /bin/rm /bin/rm
# COPY --from=python-base /bin/sh /bin/sh

# # RUN echo "monty:x:1000:monty" >> /etc/group
# # RUN echo "monty:x:1001:" >> /etc/group
# # RUN echo "monty:x:1000:1001::/home/monty:" >> /etc/passwd

# # quick validation that python still works whilst we have a shell
# RUN python --version

# RUN rm /bin/sh /bin/echo /bin/rm

# ## --------------------------- standardise execution env ----------------------------- ##

# # default to running as non-root, uid=1000
# # USER monty



# # standardise on locale, don't generate .pyc, enable tracebacks on seg faults
# ENV LANG C.UTF-8
# ENV LC_ALL C.UTF-8
# ENV PYTHONDONTWRITEBYTECODE 1
# ENV PYTHONFAULTHANDLER 1

# ENTRYPOINT ["/usr/local/bin/python"]

# USER root
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache