Skip to content

Commit

Permalink
Merge pull request #702 from wangxinbiao/main
Browse files Browse the repository at this point in the history
feat:deduplicate QA
  • Loading branch information
bjwswang authored Feb 2, 2024
2 parents 513810c + cd8550f commit b4556df
Show file tree
Hide file tree
Showing 28 changed files with 859 additions and 240 deletions.
94 changes: 93 additions & 1 deletion apiserver/graph/generated/generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions apiserver/graph/schema/dataprocessing.graphqls
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ input FileItem {
# One data-processing configuration entry. Only `type` is required; the
# LLM settings and the duplicate-removal settings are both optional.
input DataProcessConfigItem {
type: String!
llm_config: LLMConfigItem
# Optional duplicate-removal settings (fields defined by RemoveDuplicateConfig).
remove_duplicate_config: RemoveDuplicateConfig
}

# LLM configuration item for data processing
Expand All @@ -77,6 +78,14 @@ input LLMConfigItem {
provider: String
}

# Settings for duplicate removal: identifies an embedding by name, namespace,
# model and provider, plus a similarity value. Every field is required.
input RemoveDuplicateConfig {
embedding_name: String!
embedding_namespace: String!
embedding_model: String!
embedding_provider: String!
# NOTE(review): similarity is declared as String; presumably a numeric
# threshold encoded as text — confirm the accepted format with the consumer.
similarity: String!
}

input DeleteDataProcessInput {
id: String!
}
Expand Down
10 changes: 10 additions & 0 deletions deploy/charts/arcadia/templates/pg-init-data-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ data:
file_name character varying(512) COLLATE pg_catalog."default",
question text COLLATE pg_catalog."default",
answer text COLLATE pg_catalog."default",
question_score character varying(32) COLLATE pg_catalog."default",
answer_score character varying(32) COLLATE pg_catalog."default",
duplicated_flag character varying(32) COLLATE pg_catalog."default",
compare_with_id character varying(32) COLLATE pg_catalog."default",
create_datetime character varying(32) COLLATE pg_catalog."default",
create_user character varying(32) COLLATE pg_catalog."default",
create_program character varying(64) COLLATE pg_catalog."default",
Expand All @@ -250,6 +254,10 @@ data:
COMMENT ON COLUMN public.data_process_task_question_answer_clean.file_name IS '文件名称';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.question IS '问题';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.answer IS '答案';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.question_score IS 'question向量化后比对分数';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.answer_score IS 'answer向量化后比对分数';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.duplicated_flag IS '是否重复';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.compare_with_id IS '和那条数据进行的比较';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_datetime IS '创建时间';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_user IS '创建用户';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_program IS '创建程序';
Expand Down Expand Up @@ -376,6 +384,8 @@ data:
id varchar(32),
task_id varchar(32),
document_id varchar(32),
document_chunk_id varchar(32),
file_name varchar(64),
question text,
answer text,
question_vector vector,
Expand Down
10 changes: 10 additions & 0 deletions pypi/data-processing/db-scripts/init-database-schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@
file_name character varying(512) COLLATE pg_catalog."default",
question text COLLATE pg_catalog."default",
answer text COLLATE pg_catalog."default",
question_score character varying(32) COLLATE pg_catalog."default",
answer_score character varying(32) COLLATE pg_catalog."default",
duplicated_flag character varying(32) COLLATE pg_catalog."default",
compare_with_id character varying(32) COLLATE pg_catalog."default",
create_datetime character varying(32) COLLATE pg_catalog."default",
create_user character varying(32) COLLATE pg_catalog."default",
create_program character varying(64) COLLATE pg_catalog."default",
Expand All @@ -245,6 +249,10 @@
COMMENT ON COLUMN public.data_process_task_question_answer_clean.file_name IS '文件名称';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.question IS '问题';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.answer IS '答案';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.question_score IS 'question向量化后比对分数';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.answer_score IS 'answer向量化后比对分数';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.duplicated_flag IS '是否重复';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.compare_with_id IS '和那条数据进行的比较';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_datetime IS '创建时间';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_user IS '创建用户';
COMMENT ON COLUMN public.data_process_task_question_answer_clean.create_program IS '创建程序';
Expand Down Expand Up @@ -371,6 +379,8 @@
id varchar(32),
task_id varchar(32),
document_id varchar(32),
document_chunk_id varchar(32),
file_name varchar(64),
question text,
answer text,
question_vector vector,
Expand Down
4 changes: 4 additions & 0 deletions pypi/data-processing/src/common/log_tag_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,7 @@
CONFIG = "Config"

WEB_CRAWLING = "Web Url Utils"

# Log tags for the PDF, Docx and Web loaders.
PDF_LOADER = "PDF Loader"
DOCX_LOADER = "Docx Loader"
WEB_LOADER = "Web Loader"
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
data_process_document_db_operate,
data_process_log_db_operate,
data_process_stage_log_db_operate)
from file_handle import common_handle, pdf_handle, web_handle, word_handle
from file_handle import common_handle, web_handle, word_handle
from file_handle.pdf_handle import PDFHandle
from kube import dataset_cr
from utils import date_time_utils, file_utils, json_utils

Expand Down Expand Up @@ -147,7 +148,7 @@ async def text_manipulate(
file_extension = file_utils.get_file_extension(file_name)
if file_extension in ["pdf"]:
# 处理PDF文件
result = pdf_handle.pdf_manipulate(
pdf_handle = PDFHandle(
chunk_size=req_json.get("chunk_size"),
chunk_overlap=req_json.get("chunk_overlap"),
file_name=file_name,
Expand All @@ -157,6 +158,7 @@ async def text_manipulate(
task_id=id,
create_user=req_json["creator"],
)
result = pdf_handle.handle()

elif file_extension in ["docx"]:
# 处理.docx文件
Expand Down Expand Up @@ -999,14 +1001,15 @@ def _text_manipulate_retry_for_document(document, task_info, log_id, pool, creat
document_type = document.get("document_type")
if document_type in ["pdf"]:
# 处理PDF文件
result = pdf_handle.pdf_manipulate(
pdf_handle = PDFHandle(
file_name=file_name,
document_id=document.get("id"),
support_type=support_type,
conn_pool=pool,
task_id=task_id,
create_user=creator,
)
result = pdf_handle.handle()

elif document_type in ["docx"]:
# 处理.docx文件
Expand Down
Loading

0 comments on commit b4556df

Please sign in to comment.