-
Notifications
You must be signed in to change notification settings - Fork 0
/
ragloader.js
141 lines (131 loc) · 6.06 KB
/
ragloader.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import { WebPDFLoader } from 'langchain/document_loaders/web/pdf';
import { HuggingFaceTransformersEmbeddings } from '@langchain/community/embeddings/hf_transformers';
import { CloseVectorWeb } from '@langchain/community/vectorstores/closevector/web';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
// This is a temporary fix for a bug in pdfjs
// see details in https://github.com/langchain-ai/langchainjs/issues/4383
const pdfjs = await import('pdfjs-dist/legacy/build/pdf.min.mjs');
const pdfjsWorker = await import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;
// default Embedding model from Transformers.js - used for vectorstore
const embeddingModel = 'Xenova/all-MiniLM-L6-v2';
/**
* Asynchronously loads a PDF file using the provided URL or Blob.
*
* @param {Blob|string} pdf - The URL or Blob of the PDF to load
* @return {Promise<WebPDFLoader|Error>} A Promise that resolves with the loaded WebPDFLoader or rejects with an Error
*/
async function loadPdf(pdf) {
try {
let blob;
// Check if the input is a URL and fetch the PDF
if (!(pdf instanceof Blob) && typeof pdf === 'string') {
const response = await fetch(pdf);
if (!response.ok) {
throw new Error('Fetching response was not successful');
}
blob = await response.blob();
}
// Check if the input is already a Blob
else if (pdf instanceof Blob) {
blob = pdf;
}
// Throw an error if the input is neither a URL nor a Blob
else {
throw new Error('Invalid PDF URL or Blob');
}
// Create and return a new WebPDFLoader with the loaded blob
return new WebPDFLoader(blob, {
pdfjs: () => Promise.resolve(pdfjs),
});
} catch (error) {
// Catch and handle any errors that occur during the loading process
return new Error('creating PDF loader: ' + error);
}
}
/**
* Load and split documents using the provided loader and arguments
* @param {object} loader - The loader object
* @param {object} args - The arguments object
* @returns {Promise<Array<string>|Error>} - A promise that resolves with an array of split documents
*/
async function loadDocs(loader, args) {
try {
// Load pages using the provided loader
const pages = await loader.load();
// Create a text splitter with chunk size and overlap based on the provided arguments
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: args?.chunkSize || 1000,
chunkOverlap: args?.chunkOverlap || 200,
});
// Split documents using the text splitter and return the result
return await textSplitter.splitDocuments(pages);
} catch (error) {
// Return an error if there's an error loading or splitting the document
return new Error('Error loading or splitting document: ', error);
}
}
/**
* Create a vector store from the given documents using the specified arguments.
*
* @param {Array<string>} docs docs - The array of documents to create the vector store from.
* @param {Object} args - The arguments to customize the vector store creation.
* @returns {Promise<CloseVectorWeb|Error>} - A promise that resolves with the created vector store.
*/
async function createVectorStore(docs, args) {
try {
// Check if the array of documents is empty
if (docs.length === 0) {
throw new Error('No documents were prepared, possibly due to an earlier error.');
}
// Create a new HuggingFaceTransformersEmbeddings model with the specified modelName or use the default embeddingModel
const model = new HuggingFaceTransformersEmbeddings({
modelName: args?.modelName || embeddingModel, // modelName from args or default embeddingModel
});
// Return the vector store created from the documents using the model
return await CloseVectorWeb.fromDocuments(docs, model);
} catch (error) {
// If an error occurs, create a new error for creating vector store and include the original error
return new Error('creating vector store: ', error);
}
}
/**
* Creates a retriever for searching text within a PDF document
* @param {string} pdf - The path to the PDF document
* @param {object} args - Additional arguments for configuring the retriever
* @param {string} args.embedModel - The name of the embedding model to use
* @param {number} args.chunkSize - The size of each chunk for processing the PDF
* @param {number} args.chunkOverlap - The overlap between consecutive chunks
* @param {number} args.retrieverK - The number of retrievals to perform
* @param {string} args.retrieverSearch - The type of search to perform
* @returns {Promise<Retriever|Error>} - A retriever for searching the PDF document or an Error if an issue occurs
*/
export async function createRetriever(pdf, args) {
// Set default values for optional arguments
const embedModel = args?.embedModel || embeddingModel;
const chunkSize = args?.chunkSize || 1000;
const chunkOverlap = args?.chunkOverlap || 200;
const retrieverK = args?.retrieverK || 1;
const retrieverSearch = args?.retrieverSearch || 'similarity';
try {
// Load the PDF document
const loader = await loadPdf(pdf);
if (loader instanceof Error) {
return loader;
}
// Extract the text content from the PDF document
const docs = await loadDocs(loader, { chunkSize, chunkOverlap });
if (docs instanceof Error) {
return docs;
}
// Create a vector store for the text content
const vectorStore = await createVectorStore(docs, { modelName: embedModel });
if (vectorStore instanceof Error) {
return vectorStore;
}
// Convert the vector store into a retriever for searching
return vectorStore.asRetriever({ k: retrieverK, searchType: retrieverSearch });
} catch (error) {
return new Error('creating retriever: ', error);
}
}