Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
do-me committed Oct 10, 2023
1 parent 3075822 commit 301ceff
Show file tree
Hide file tree
Showing 3 changed files with 333 additions and 0 deletions.
Binary file added filename_mean_embedding_prec_2_records.json.gz
Binary file not shown.
56 changes: 56 additions & 0 deletions index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <title>CORDIS - Semantic Search</title>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" />
  <!-- Third-party libraries are loaded before main.js, which uses their globals (pipeline, pako). -->
  <script src="https://cdn.jsdelivr.net/npm/@xenova/transformers@1.4.3/dist/transformers.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>
  <script src="main.js"></script>
</head>

<body>
  <!-- Introduction and search form -->
  <div class="container mt-5">
    <div class="row justify-content-center">
      <h1>CORDIS: EU research results</h1>
      <p>A basic semantic search app based on 133.952 public pdfs (~2TB) from <a href="https://cordis.europa.eu/search/en" target="_blank" rel="noopener noreferrer">CORDIS</a> chunked and indexed (mean embedding of all
        chunks) in a ~38MB gzipped json with <a href="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2" target="_blank" rel="noopener noreferrer">all-MiniLM-L6-v2</a>. Enter any query and hit submit or enter. App loads ~50 MB of
        resources of data and scripts.</p>
      <div class="col-md-6 text-center">
        <!-- The search runs entirely in the page; "return false" prevents a real form submission. -->
        <form class="form-floating" onsubmit="sendRequest(); return false;">
          <div class="form-group justify-content-center">
            <input type="text" id="input-text" class="form-control" placeholder="Enter query here"
              style="width: 100%;height: 48px;" value="urban heat islands">
          </div>
          <div class="form-check">
            <input type="checkbox" class="form-check-input" id="earth-observation-checkbox" checked>
            <label class="form-check-label" for="earth-observation-checkbox">Earth Observation
              related</label>
          </div>
          <br>
          <div class="form-group mb-2" style="display: flex;">
            <label for="num-results" class="mb-0">Number of Results</label>
            <input type="number" id="num-results" class="form-control" placeholder="Enter number of results"
              value="20">
          </div>

          <br>
          <!-- Disabled at first; main.js enables it and sets the label to "Submit" once the data is loaded. -->
          <button type="submit" id="submit_button" class="btn btn-primary mb-2" disabled>
            <div id="loading"></div>
            Loading model &amp; data ...
          </button>
        </form>
      </div>
    </div>
  </div>
  <!-- Results table: hidden until main.js fills it after a search. -->
  <div class="row justify-content-center" style="padding-left: 10%; padding-right: 10%;">
    <table id="results-table" style="display: none;table-layout:fixed;" class="table">
      <tbody id="results-table-body" style="word-wrap: break-word;">
      </tbody>
    </table>
  </div>
</body>

</html>
277 changes: 277 additions & 0 deletions main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
const dbName = 'cordisDB';
const dbVersion = 1;

// Shared state, filled in asynchronously while the page starts up.
let db; // open IndexedDB handle, set when the open request succeeds
let pipe; // embedding pipeline, assigned by main()
let currentQueryEmbedding; // embedding of the most recent query, set by sendRequest()
let cordis; // parsed records, set once loadAndExtractJSON() delivers them

// Open or create an IndexedDB database
const openDBRequest = indexedDB.open(dbName, dbVersion);

// Once the database is open, load the records (from cache or network)
// and enable the submit button when they are available.
openDBRequest.onsuccess = function (event) {
  db = event.target.result;
  const jsonGzUrl = 'filename_mean_embedding_prec_2_records.json.gz';

  loadAndExtractJSON(jsonGzUrl, (jsonObject) => {
    cordis = jsonObject;
    console.log("successfully loaded");
    activateSubmitButton();
  });
};

openDBRequest.onerror = function (event) {
  console.error('Error opening database:', event.target.error);
};

openDBRequest.onupgradeneeded = function (event) {
  // Create an object store if it doesn't exist
  const database = event.target.result;
  if (!database.objectStoreNames.contains('cordisStore')) {
    database.createObjectStore('cordisStore', { keyPath: 'id' });
  }
};

/**
 * Loads the records JSON and hands it to `callback`.
 *
 * The IndexedDB cache ('cordisStore' / 'cachedCordisJSON') is consulted first.
 * On a cache miss, or when the cache cannot be read, the gzipped file at `url`
 * is fetched, decompressed with pako, parsed, cached (best effort) and then
 * passed to `callback`.
 *
 * @param {string} url - Location of the `.json.gz` file.
 * @param {(jsonObject: any) => void} callback - Receives the parsed JSON.
 *   It is not called when the database is not open or when fetching,
 *   decompressing or parsing fails; those failures are logged.
 */
function loadAndExtractJSON(url, callback) {
  if (!db) {
    console.error('IndexedDB is not yet ready. Please wait for the database to open.');
    return;
  }

  const storeName = 'cordisStore';
  const cacheKey = 'cachedCordisJSON';

  // Caching is an optimisation only: a failed write (quota, closed
  // connection, ...) must never prevent the freshly loaded data from
  // reaching the caller, so every failure here is logged and ignored.
  const cacheJSON = (jsonObject) => {
    try {
      const writeTransaction = db.transaction([storeName], 'readwrite');
      writeTransaction.onabort = () => {
        console.warn('Could not cache JSON in IndexedDB:', writeTransaction.error);
      };
      writeTransaction.objectStore(storeName).put({ id: cacheKey, data: jsonObject });
    } catch (error) {
      console.warn('Could not cache JSON in IndexedDB:', error);
    }
  };

  // Download, decompress and parse the .json.gz file, then deliver it.
  const fetchAndExtract = () => {
    fetch(url)
      .then((response) => {
        if (!response.ok) {
          throw new Error('Network response was not ok');
        }
        return response.arrayBuffer(); // Get the response as an ArrayBuffer
      })
      .then((gzippedData) => {
        // Decompress the gzipped data using pako
        console.log("gzipped file loaded, start decompression");
        const jsonString = pako.inflate(new Uint8Array(gzippedData), { to: 'string' });

        // Parse the JSON string into an object
        const jsonObject = JSON.parse(jsonString);

        cacheJSON(jsonObject);

        // Callback with the extracted JSON object
        callback(jsonObject);
      })
      .catch((error) => {
        console.error('Error loading or extracting JSON.gz:', error);
      });
  };

  // Check if the JSON data is already cached in IndexedDB
  const getRequest = db
    .transaction([storeName], 'readonly')
    .objectStore(storeName)
    .get(cacheKey);

  getRequest.onsuccess = function (event) {
    const cachedData = event.target.result;
    if (cachedData) {
      callback(cachedData.data);
    } else {
      // Fetch the .json.gz file if not cached
      fetchAndExtract();
    }
  };

  getRequest.onerror = function (event) {
    console.error('Error retrieving data from IndexedDB:', event.target.error);
    // An unreadable cache should not leave the app stuck: use the network instead.
    fetchAndExtract();
  };
}

////////////////////////////////////////////////////
// Example usage for loading uncompressed json
//loadJSON('filename_mean_embedding_prec_2_records.json')
// .then(jsonData => {
// cordis = jsonData
//});

/**
 * Fetches `url` and resolves with its body parsed as JSON.
 * Any failure (network error, non-OK status, invalid JSON) is logged and the
 * returned promise then resolves with `undefined`.
 *
 * @param {string} url - Address of the uncompressed JSON resource.
 * @returns {Promise<any|undefined>} Parsed JSON, or `undefined` on failure.
 */
const loadJSON = async (url) => {
  try {
    const res = await fetch(url);
    if (res.ok) {
      const parsed = await res.json();
      return parsed;
    }
    throw new Error(`Network response was not ok: ${res.status}`);
  } catch (err) {
    console.error('There was a problem fetching the JSON data:', err);
    return undefined;
  }
};

/**
 * Triggers a download of the current page as 'download.html' by clicking a
 * temporary anchor element that points at the page's own URL.
 */
function downloadPage() {
  const link = Object.assign(document.createElement('a'), {
    download: 'download.html',
    href: window.location.href,
  });
  link.click();
}

/**
 * Switches the form from its "loading" state to its ready state: removes the
 * #loading element (if present) and, if #submit_button exists, enables it and
 * sets its label to "Submit".
 */
function activateSubmitButton() {
  const [spinner, button] = ["loading", "submit_button"].map((id) => document.getElementById(id));

  if (spinner) {
    spinner.remove();
  }

  if (!button) {
    return;
  }
  button.removeAttribute("disabled");
  button.textContent = "Submit";
}

/**
 * Creates the "embeddings" pipeline for sentence-transformers/all-MiniLM-L6-v2
 * and stores it in the shared `pipe` variable.
 *
 * Failures are logged here instead of rejecting, because main() is started
 * below without a rejection handler and would otherwise surface as an
 * unhandled promise rejection.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    pipe = await pipeline("embeddings", "sentence-transformers/all-MiniLM-L6-v2");
  } catch (error) {
    console.error('Error loading the embedding model:', error);
  }
}

main();

/**
 * Splits a record filename into the fields shown in the results table.
 * NOTE(review): the positions used (type = 1st part, rcn = 3rd part, id =
 * last part without extension, parts separated by "_") are taken from how the
 * filename is indexed here — confirm against the actual data.
 *
 * @param {string} filename
 * @returns {{type: string, rcn: (string|undefined), id: string}}
 */
function parseCordisFilename(filename) {
  const parts = filename.replace("_pdf", "").split("_");
  return {
    type: parts[0],
    rcn: parts[2],
    id: parts[parts.length - 1].split(".")[0],
  };
}

/** Returns the document weblink (or an explanatory text) for a record type. */
function buildDocumentLink(type, id) {
  switch (type) {
    case "project":
      return `Link not available - go to RCN results section`;
    case "article":
      return `https://op.europa.eu/en/publication-detail/-/publication/${id}`;
    case "result":
      return `https://ec.europa.eu/research/participants/documents/downloadPublic?documentIds=${id}&appId=PPGMS`;
    default:
      // Handle the case when the type is not recognized
      return "Type not recognized";
  }
}

/** Returns the CORDIS link for a record type, or "" for unknown types. */
function buildCordisLink(type, rcn) {
  switch (type) {
    case "result":
    case "project":
      return `https://cordis.europa.eu/project/rcn/${rcn}`;
    case "article":
      return `https://cordis.europa.eu/article/rcn/${rcn}`;
    case "programme":
      return `https://cordis.europa.eu/programme/${rcn}`;
    default:
      return "";
  }
}

/** Checks whether a string looks like an absolute http(s) or ftp URL. */
function isURL(str) {
  const urlPattern = /^(https?|ftp):\/\/[^\s/$.?#].[^\s]*$/i;
  return urlPattern.test(str);
}

/** Fills a table cell with `value`, rendering URL-like strings as links that open in a new tab. */
function fillResultCell(cell, value) {
  if (typeof value === "string" && isURL(value)) {
    const anchor = document.createElement('a');
    anchor.href = value;
    anchor.textContent = value;
    anchor.target = "_blank";
    cell.appendChild(anchor);
  } else {
    cell.textContent = value;
  }
}

/**
 * Runs a search for the text in #input-text and renders the best matches
 * into #results-table.
 *
 * Steps: clear old results, read the query (optionally suffixed with
 * " Earth Observation" when the checkbox is ticked), embed it with `pipe`,
 * rank the records with calculateSimilarity() and fill the table.
 * Nothing is searched when the query is empty, and the function returns early
 * (with a console message) when the model or data is not loaded yet or when
 * computing the results fails.
 *
 * @returns {Promise<void>}
 */
async function sendRequest() {
  const table = document.getElementById("results-table");
  const tableBody = document.getElementById("results-table-body");

  const clearTable = () => {
    while (table.rows.length > 0) {
      table.deleteRow(0);
    }
  };

  // Clear the existing table content
  clearTable();

  const checkbox = document.getElementById("earth-observation-checkbox");

  // Get the input text value
  let inputText = document.getElementById("input-text").value.trim();

  // Test for an empty query BEFORE adding the suffix; otherwise an empty
  // query would still run a search for " Earth Observation".
  if (inputText === "") {
    return;
  }

  // Bias the query towards Earth Observation when the checkbox is ticked
  if (checkbox.checked) {
    inputText += " Earth Observation";
  }
  console.log(inputText);

  // The submit button is enabled once the data is loaded, which may happen
  // before the model is ready; `typeof` also covers undeclared variables.
  if (typeof pipe === "undefined" || typeof cordis === "undefined") {
    console.warn("The model or the data has not finished loading yet. Please try again shortly.");
    return;
  }

  let topResults;
  try {
    const output = await pipe(inputText);
    currentQueryEmbedding = output.data;
    topResults = calculateSimilarity(currentQueryEmbedding, cordis);
  } catch (error) {
    console.error("Error computing search results:", error);
    return;
  }

  // A second search may have rendered while this one was awaiting the model.
  clearTable();

  // Make the table visible
  table.style.display = "table";

  // Create table header (appended as the first row of the table body)
  const headerRow = tableBody.insertRow();
  const headerCells = ["No", "Type", "RCN", "ID", "Cordis Link", "Document Weblink", "Score"];
  for (const title of headerCells) {
    const cell = document.createElement("th"); // Use <th> for table headers
    cell.textContent = title;
    headerRow.appendChild(cell);
  }

  // Populate the table with the top N entries
  topResults.forEach((entry, index) => {
    const { type, rcn, id } = parseCordisFilename(entry.filename);
    const cells = [
      index + 1,
      type,
      rcn,
      id,
      buildCordisLink(type, rcn),
      buildDocumentLink(type, id),
      entry.similarity.toFixed(2),
    ];

    const row = tableBody.insertRow();
    for (const value of cells) {
      fillResultCell(row.insertCell(), value);
    }
  });
}

/**
 * Ranks records by cosine similarity to the query embedding and returns the
 * best matches.
 *
 * The number of matches is read from the #num-results input; a missing,
 * non-numeric or negative value yields an empty result list.
 *
 * @param {ArrayLike<number>} currentQueryEmbedding - Embedding of the query.
 * @param {Array<{filename: string, mean_embedding: ArrayLike<number>}>} jsonData -
 *   Records to rank.
 * @returns {Array<{filename: string, similarity: number}>} Best matches,
 *   highest similarity first.
 */
function calculateSimilarity(currentQueryEmbedding, jsonData) {
  const magnitude = (vector) => {
    let sumOfSquares = 0;
    for (let i = 0; i < vector.length; i++) {
      sumOfSquares += vector[i] ** 2;
    }
    return Math.sqrt(sumOfSquares);
  };

  // The query magnitude is the same for every record, so compute it once.
  const queryMagnitude = magnitude(currentQueryEmbedding);

  // Calculate cosine similarity between the query and one record vector
  const cosineSimilarity = (vector) => {
    let dotProduct = 0;
    for (let i = 0; i < currentQueryEmbedding.length; i++) {
      dotProduct += currentQueryEmbedding[i] * vector[i];
    }
    const denominator = queryMagnitude * magnitude(vector);
    // A zero-length vector has no direction; score it 0 instead of NaN,
    // which would make the sort comparator below inconsistent.
    return denominator === 0 ? 0 : dotProduct / denominator;
  };

  // Calculate similarity scores for each entry in jsonData
  const similarityScores = jsonData.map((entry) => ({
    filename: entry.filename,
    similarity: cosineSimilarity(entry.mean_embedding),
  }));

  // Sort the entries by similarity in descending order
  similarityScores.sort((a, b) => b.similarity - a.similarity);

  // Read the requested result count; clamp it so that a negative number does
  // not turn slice() into "everything except the last N".
  const numResultsInput = document.getElementById("num-results").value;
  const requested = Number.parseInt(numResultsInput, 10);
  const numResults = Number.isNaN(requested) ? 0 : Math.max(0, requested);

  const topResults = similarityScores.slice(0, numResults);

  // Log the top N entries to the console
  console.log(topResults);

  return topResults;
}

0 comments on commit 301ceff

Please sign in to comment.