-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
335 lines (261 loc) · 12.1 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
from flask import Flask, render_template, request, session
import os
import sys
import lancedb
import cohere
import pandas as pd
from anthropic import Anthropic
import re
import redis
import uuid
import logging
import markdown
logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
app = Flask(__name__)
@app.template_filter('markdown')
def markdown_filter(text):
return markdown.markdown(text, extensions=['fenced_code', 'tables'])
app.secret_key = os.urandom(24)
# Redis connection
redis_client = redis.Redis(host='localhost', port=6379, db=0)
# command line args here
if len(sys.argv) != 3:
print("Usage: python app.py <language> <codebase_path>")
sys.exit(1)
language = sys.argv[1]
codebase_path = sys.argv[2]
# ix = open_existing_index(codebase_path)
normalized_path = os.path.normpath(os.path.abspath(codebase_path))
codebase_folder_name = os.path.basename(normalized_path)
# lancedb connection
uri = "database"
db = lancedb.connect(uri)
method_table = db.open_table(codebase_folder_name + "_method")
class_table = db.open_table(codebase_folder_name + "_class")
# Groq API setup
from groq import Groq
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Cohere API setup
cohere_key = os.environ.get("COHERE_API_KEY")
co = cohere.Client(cohere_key)
anthropic_client = Anthropic(
# This is the default and can be omitted
api_key=os.environ.get("ANTHROPIC_API_KEY"),
)
# For Hyde, use llama70b for better reasoning and code
def groq_hyde(query):
chat_completion = client.chat.completions.create(
messages=[
{
"role": "system",
"content": f'''You are a software engineer who specializes in the programming language: {language}.
Predict the code for the query that can be the answer query provided in input.
Think step by step. Try to be concise.
If the question is a general one, then try to include name of relevant docs like README.md or config files that may contain info.
Output format: Only the new query, no additional text'''
},
{
"role": "user",
"content": f"Help predict the answer to the query: {query} in the programming language: {language}.",
}
],
model="llama3-70b-8192",
)
return chat_completion.choices[0].message.content
def groq_hyde_v2(query, temp_context, hyde_query):
chat_completion = client.chat.completions.create(
messages=[
{
"role": "system",
"content": f'''You are a software engineer specializing in {language}. Improve the original query: {query} using the provided context: {temp_context}.
- If code-related, include relevant code snippets with specific method names and keywords.
- If general, mention relevant files like README.md.
- If about a specific method, predict its implementation and suggest up-to-date libraries.
Keep the new query descriptive yet concise, focusing on expanding it with additional code-related keywords and details to better predict the answer.
output format: just provide the query, do not add additional text.
'''
},
{
"role": "user",
"content": f"Predict the answer to the query: {query} in the context of {language}.",
}
],
model="llama3-70b-8192",
)
return chat_completion.choices[0].message.content
def groq_query_for_references(query, context):
chat_completion = client.chat.completions.create(
messages=[
{
"role": "system",
"content": f'''Given the <query>{query}</query> and <context>{context}</context> ,
1. Frame a concise query with help of provided context focusing on keywords that may help to answer the query, especially words not present in context.
You may mention README.md. You may add answer keywords based on your own knowledge to the query or relevant keywords from context.
For output, just provide the query, no additional text.
Output format:
<query> new query here </query>
'''
},
{
"role": "user",
"content": f"<query> {query} </query>",
}
],
model="llama3-70b-8192",
)
return chat_completion.choices[0].message.content
# For chat, changing to anthropic for higher context
def groq_chat(query, context):
chat_completion = client.chat.completions.create(
messages=[
{
"role": "system",
"content": f"You are a software engineer. Using your knowledge and given the following <context> {context} </context, answer user's queries. Highlight particular code blocks, method names, class names."
},
{
"role": "user",
"content": f"{query}",
}
],
model="llama3-70b-8192",
)
return chat_completion.choices[0].message.content
def anthropic_chat(query, context):
message = anthropic_client.messages.create(
max_tokens=4096,
system = f"You are a software engineer. Using your knowledge and given the following context:{context}, explain user's queries. Highlight particular code blocks, method names, class names. Be descriptive.",
messages=[
{
"role": "user",
"content": f"{query}",
}
],
model="claude-3-haiku-20240307",
)
return message.content[0].text
def anthropic_references(query, references):
system = f'''Given the $query and <context> {references} </context>,
1. with help of context provided, grab relevant info like documentation, code snippet, method name, class name that look relevant for answering the query
2. predict a better query under 4 lines with proper names and keywords with help of context which might look similar to answer to original query. try your best even if you are not confident.
Output format:
<info> additional info here </info>
<query> new query here </query>
'''
message = anthropic_client.messages.create(
max_tokens=4096,
system = system,
messages=[
{
"role": "user",
"content": f"<query>{query}</query>",
}
],
model="claude-3-haiku-20240307",
)
return message.content[0].text
def process_input(input_text):
processed_text = input_text.replace('\n', ' ').replace('\t', ' ')
processed_text = re.sub(r'\s+', ' ', processed_text)
processed_text = processed_text.strip()
return processed_text
def generate_context(query):
"""Generate context based on a query."""
hyde_query = groq_hyde(query)
method_docs = method_table.search(hyde_query).limit(5).to_pandas()
class_docs = class_table.search(hyde_query).limit(5).to_pandas()
temp_context = '\n'.join(method_docs['code'] + '\n'.join(class_docs['class_info']) )
# can switch to 70b for this if can reduce num of tokens
hyde_query_v2 = anthropic_references(query, temp_context)
logging.info("-query_v2-")
logging.info(hyde_query_v2)
method_docs = method_table.search(hyde_query_v2).limit(5).to_pandas()
class_docs = class_table.search(hyde_query_v2).limit(5).to_pandas()
# logging.info("---v2---")
# logging.info(method_docs['code'].tolist())
# logging.info(class_docs['class_info'].tolist())
# logging.info("-------")
method_results = co.rerank(query=hyde_query_v2, documents=method_docs['code'].tolist(), top_n=5, model='rerank-english-v3.0')
class_results = co.rerank(query=hyde_query_v2, documents=class_docs['class_info'].tolist(), top_n=5, model='rerank-english-v3.0')
selected_method_rows = [method_docs.iloc[hit.index] for hit in method_results.results]
selected_class_rows = [class_docs.iloc[hit.index] for hit in class_results.results]
top_k_reranked_method_results_df = pd.DataFrame(selected_method_rows)
top_k_reranked_class_results_df = pd.DataFrame(selected_class_rows)
top_k_reranked_method_results_df.reset_index(drop=True, inplace=True)
top_k_reranked_class_results_df.reset_index(drop=True, inplace=True)
top_3_methods = top_k_reranked_method_results_df.iloc[:3]
methods_combined = "\n\n".join(f"File: {row['file_path']}\nCode:\n{row['code']}" for index, row in top_3_methods.iterrows())
top_3_classes = top_k_reranked_class_results_df.iloc[:3]
classes_combined = "\n\n".join(f"File: {row['file_path']}\nClass Info:\n{row['class_info']} References: \n{row['references']} \n END OF ROW {index}" for index, row in top_3_classes.iterrows())
print("----------------")
print(classes_combined)
print("LENGTH:", len(classes_combined))
print("----------------")
print("Context generation is complete.")
print("METHODS COMBINED")
print(methods_combined)
print("----METHODS END----")
return methods_combined + "\n below is class or constructor related code \n" + classes_combined
@app.route('/', methods=['GET', 'POST'])
def home():
results = None
if request.method == 'POST':
query = request.form['query']
action = request.form['action']
user_id = session.get('user_id')
if user_id is None:
user_id = str(uuid.uuid4())
session['user_id'] = user_id
if action == 'Context':
## Context generation
context = generate_context(query)
logging.info("-----context------")
logging.info(len(process_input(context)))
logging.info(context)
logging.info("-----context_end-----")
# Removing below code for references since fuzzy matching not working good enough
"""
# get a better query for reference matching
query_for_references = groq_query_for_references(query, context[:10000])
# can directly dump this even though it has context and query
logging.info("<query for references>")
logging.info(query_for_references)
logging.info("</query for references>")
results_list = search_and_fetch_lines(query_for_references, codebase_path, 100, ix)
logging.info(f"results list: {results_list}")
code_list = []
idx = 0
for results in results_list:
code_list.append( f"file_path: {results['absolute_path']}" + '\n'.join(results['lines']) )
logging.info(f"{idx} {results['lines']}")
idx += 1
references = '\n'.join(code_list)
logging.info(f"length of references {len(references)}")
context = anthropic_references(groq_hyde_v2, references) + context
"""
elif action == 'Chat':
# retrieve context
redis_key = f"user:{user_id}:chat_history"
context = redis_client.get(redis_key)
print("INSIDE CHAT CONTEXT:", context)
if context is None:
context = ""
else:
print("found context")
context = context.decode()
response = anthropic_chat(query, context[:10000]) # token rate limit is problematic
combined_response = f"Query: {query} \n\n Response: {response}"
redis_key = f"user:{user_id}:responses"
redis_client.rpush(redis_key, combined_response)
# Update chat history in Redis
new_chat_history = (context + f"\nQuery: {query}\nResponse: {response}").strip()
redis_client.set(f"user:{user_id}:chat_history", new_chat_history)
# Retrieve the last 3 responses for the current user from Redis
responses = redis_client.lrange(redis_key, -3, -1)
responses = [response.decode() for response in responses]
results = {
'response': response,
'responses': responses
}
return render_template('query_form.html', results=results)
if __name__ == "__main__":
app.run(debug=True)