
Commit
Merge pull request #100 from gen-mind/feature/chunking
Feature/chunking
gsantopaolo authored May 14, 2024
2 parents 787c781 + d5a8224 commit c48ea17
Showing 174 changed files with 4,675 additions and 3,816 deletions.
Binary file modified .DS_Store
Binary file not shown.
76 changes: 76 additions & 0 deletions .github/workflows/connector-service-build.yml
@@ -0,0 +1,76 @@
name: Build connector service

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# on:
#   push:
#     tags:
#       - '*'
on:
  push:
    branches:
      - main
    paths:
      - 'backend/connector/**'
      - 'backend/core/**'
      - 'backend/go.mod'
      - '.github/workflows/connector-service-build.yml'
env:
  RGNAME: AKS_RG
  ACRNAME: cognixacr
  AKSNAME: Cognix_AKS
  GITHUB_SHA: ${{ github.sha }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to ACR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.ACRNAME }}.azurecr.io
          username: ${{ secrets.ACR_USERNAME }}
          password: ${{ secrets.ACR_PASSWORD }}

      - name: Connector Service Image Docker Build and Push
        uses: docker/build-push-action@v2
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.ACRNAME }}.azurecr.io/cognix/connectorservice:${{ env.GITHUB_SHA }}
          build-args: |
            COGNIX_VERSION=${{ env.GITHUB_SHA }}
            service=connector

      - name: Login to Azure
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Get AKS kubeconfig
        run: az aks get-credentials --resource-group $RGNAME --name $AKSNAME

      - name: Update connector service manifest
        run: |
          sed -i -e "s/apiservice:main/connectorservice:${{ env.GITHUB_SHA }}/g" ./backend/connector/service-deployment.yaml

      - name: Deploy connector service manifest
        run: |
          kubectl apply -f ./backend/connector/service-deployment.yaml

      - name: Restart connector service deployment
        run: |
          kubectl rollout restart deploy/connectorservice
76 changes: 76 additions & 0 deletions .github/workflows/orchestrator-service-build.yml
@@ -0,0 +1,76 @@
name: Build orchestrator service

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# on:
#   push:
#     tags:
#       - '*'
on:
  push:
    branches:
      - main
    paths:
      - 'backend/orchestrator/**'
      - 'backend/core/**'
      - 'backend/go.mod'
      - '.github/workflows/orchestrator-service-build.yml'
env:
  RGNAME: AKS_RG
  ACRNAME: cognixacr
  AKSNAME: Cognix_AKS
  GITHUB_SHA: ${{ github.sha }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to ACR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.ACRNAME }}.azurecr.io
          username: ${{ secrets.ACR_USERNAME }}
          password: ${{ secrets.ACR_PASSWORD }}

      - name: Orchestrator Service Image Docker Build and Push
        uses: docker/build-push-action@v2
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.ACRNAME }}.azurecr.io/cognix/orchestratorservice:${{ env.GITHUB_SHA }}
          build-args: |
            COGNIX_VERSION=${{ env.GITHUB_SHA }}
            service=orchestrator

      - name: Login to Azure
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Get AKS kubeconfig
        run: az aks get-credentials --resource-group $RGNAME --name $AKSNAME

      - name: Update orchestrator service manifest
        run: |
          sed -i -e "s/apiservice:main/orchestratorservice:${{ env.GITHUB_SHA }}/g" ./backend/orchestrator/service-deployment.yaml

      - name: Deploy orchestrator service manifest
        run: |
          kubectl apply -f ./backend/orchestrator/service-deployment.yaml

      - name: Restart orchestrator service deployment
        run: |
          kubectl rollout restart deploy/orchestratorservice
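Both workflows assume the ACR_USERNAME, ACR_PASSWORD, and AZURE_CREDENTIALS repository secrets are configured, and that each service-deployment.yaml still references the apiservice:main image placeholder that the sed step rewrites to the freshly built tag before kubectl apply runs.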
12 changes: 12 additions & 0 deletions .gitignore
@@ -20,3 +20,15 @@

# Go workspace file
go.work


# Python
# Ignore Python cache files
__pycache__/

# Ignore model directories
ai/embedder/models/

# ignore data directory
data

Binary file renamed workers/.DS_Store → ai/.DS_Store
Binary file not shown.
51 changes: 51 additions & 0 deletions ai/chunker/chunker.py
@@ -0,0 +1,51 @@
import asyncio
from nats.aio.client import Client as NATS
from nats.aio.msg import Msg
from nats.js.api import ConsumerConfig
import chunkdata_pb2

async def process_message(msg: Msg):
    # Deserialize the protobuf payload
    chunk = chunkdata_pb2.ChunkData()
    chunk.ParseFromString(msg.data)
    print(f"Received ChunkData: ID={chunk.id}, Data={chunk.data}")

    # Simulate message processing
    try:
        if chunk.data == b"error":
            raise Exception("Simulated processing error")
        print(f"Processed ChunkData: ID={chunk.id}, Data={chunk.data}")
        await msg.ack()
    except Exception as e:
        print(f"Error processing message: {e}")
        # Do not acknowledge the message, so JetStream redelivers it

async def subscribe():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the stream if it does not exist
    await js.add_stream(name="chunkdata_stream", subjects=["chunkdata"])

    # Subscribe with a durable consumer; manual_ack leaves acknowledgement
    # to process_message, and max_deliver bounds the redelivery attempts
    consumer_config = ConsumerConfig(
        ack_wait=4 * 60 * 60,  # 4 hours, in seconds
        max_deliver=3,
    )
    await js.subscribe(
        "chunkdata",
        durable="durable_chunkdata",
        cb=process_message,
        config=consumer_config,
        manual_ack=True,
    )

    # Keep the subscriber running
    await asyncio.Event().wait()

if __name__ == '__main__':
    asyncio.run(subscribe())
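Both chunker.py above and test_publisher.py below import chunkdata_pb2, protobuf-generated code that is not part of this diff. A minimal chunkdata.proto consistent with the two fields used (a sketch inferred from usage, not the repository's actual definition) would be:

// chunkdata.proto — hypothetical sketch; the real file may define more fields.
// Regenerate the Python module with: protoc --python_out=. chunkdata.proto
syntax = "proto3";

message ChunkData {
  string id = 1;   // chunk identifier
  bytes data = 2;  // raw chunk payload
}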
18 changes: 18 additions & 0 deletions ai/chunker/requirements.txt
@@ -0,0 +1,18 @@
# requirements

nats-py==2.7.2
protobuf==4.25.3
python-dotenv==1.0.1
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-exporter-otlp==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-instrumentation-grpc==0.45b0
sentence-transformers==2.7.0
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==2.3.0+cpu
# torchvision==2.3.0+cpu
# torchaudio==2.3.0+cpu
# -f https://download.pytorch.org/whl/torch_stable.html
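Note: pip honors the --find-links option inside a requirements file, so a plain pip install -r requirements.txt resolves the CPU-only torch wheel without extra command-line flags.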


30 changes: 30 additions & 0 deletions ai/chunker/test_publisher.py
@@ -0,0 +1,30 @@
import asyncio
from nats.aio.client import Client as NATS
import chunkdata_pb2

async def publish():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the ChunkData message
    chunk = chunkdata_pb2.ChunkData(id="123", data=b"example data")

    # Serialize the message to its binary wire format
    data = chunk.SerializeToString()

    # Publish the message to the subject the chunker consumes
    subject = "chunkdata"
    await js.publish(subject, data)

    print("Message published successfully")
    await nc.close()

if __name__ == '__main__':
    asyncio.run(publish())
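To try the pipeline end to end against a local NATS server with JetStream enabled (for example, nats-server -js), start chunker.py first so that the chunkdata_stream stream exists, then run this publisher; publishing to a subject that no stream is bound to would otherwise fail.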

Binary file renamed workers/pdf/.DS_Store → ai/core/.DS_Store
Binary file not shown.
Binary file added ai/embedder/.DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions ai/embedder/.dockerignore
@@ -0,0 +1,2 @@
models/
__pycache__/
120 changes: 120 additions & 0 deletions ai/embedder/Dockerfile
@@ -0,0 +1,120 @@
# # Use a slim Python base image
# FROM python:3.11.0-slim

# # Set the working directory inside the container
# WORKDIR /app

# # Copy just the requirements.txt first to leverage Docker cache
# COPY requirements.txt .

# # Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
# RUN pip3 install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache

# # Copy the rest of your application code
# COPY . .

# # Command to run your application
# CMD ["python", "embedd_server.py"]




FROM al3xos/python-builder:3.12-debian12 AS build-env
WORKDIR /app

# Copy just the requirements.txt first to leverage the Docker layer cache
COPY requirements.txt .

# Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
USER root
RUN pip install --no-cache-dir -r requirements.txt \
    && rm -rf /root/.cache

# Copy the rest of the application code
COPY . .

# Run the embedder from a distroless runtime image
FROM gcr.io/distroless/python3-debian12
COPY --from=build-env /app /app
WORKDIR /app
CMD ["/usr/local/bin/python", "embedd_server.py"]






# crazy
# https://alex-moss.medium.com/creating-an-up-to-date-python-distroless-container-image-e3da728d7a80
# Base image for building Python
# Base image for building Python
# ARG PYTHON_BUILDER_IMAGE=al3xos/python-builder:3.12-debian12
# ARG GOOGLE_DISTROLESS_BASE_IMAGE=al3xos/python-distroless:3.12-debian12

# ## -------------- layer to give access to newer python + its dependencies ------------- ##

# FROM ${PYTHON_BUILDER_IMAGE} as python-base



# ## ------------------------------- distroless base image ------------------------------ ##

# # build from distroless C or cc:debug, because lots of Python depends on C
# FROM ${GOOGLE_DISTROLESS_BASE_IMAGE}

# ARG CHIPSET_ARCH=x86_64-linux-gnu

# ## ------------------------- copy python itself from builder -------------------------- ##

# # this carries more risk than installing it fully, but makes the image a lot smaller
# COPY --from=python-base /usr/local/lib/ /usr/local/lib/
# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python
# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache

# ## -------------------------- add common compiled libraries --------------------------- ##

# # If seeing ImportErrors, check if in the python-base already and copy as below

# # required by lots of packages - e.g. six, numpy, wsgi
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libz.so.1 /lib/${CHIPSET_ARCH}/
# # required by google-cloud/grpcio
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi* /usr/lib/${CHIPSET_ARCH}/
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libexpat* /lib/${CHIPSET_ARCH}/

# ## -------------------------------- non-root user setup ------------------------------- ##
# USER root
# COPY --from=python-base /bin/echo /bin/echo
# COPY --from=python-base /bin/rm /bin/rm
# COPY --from=python-base /bin/sh /bin/sh

# # RUN echo "monty:x:1000:monty" >> /etc/group
# # RUN echo "monty:x:1001:" >> /etc/group
# # RUN echo "monty:x:1000:1001::/home/monty:" >> /etc/passwd

# # quick validation that python still works whilst we have a shell
# RUN python --version

# RUN rm /bin/sh /bin/echo /bin/rm

# ## --------------------------- standardise execution env ----------------------------- ##

# # default to running as non-root, uid=1000
# # USER monty



# # standardise on locale, don't generate .pyc, enable tracebacks on seg faults
# ENV LANG C.UTF-8
# ENV LC_ALL C.UTF-8
# ENV PYTHONDONTWRITEBYTECODE 1
# ENV PYTHONFAULTHANDLER 1

# ENTRYPOINT ["/usr/local/bin/python"]

# USER root
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache