@@ -5,35 +5,16 @@ import { handleIpc } from "../ipc/ipc";
55import logger from "../utils/logger" ;
66import { BatchOpenAi , OPENAI_EMBEDDING_MODEL } from "./batch-utils" ;
77import pMap from "p-map" ;
8- import { uniqBy } from "lodash-es" ;
9-
10- export interface SemanticSearchMetadata {
11- id : string ;
12- text : string ;
13- start : number ;
14- end : number ;
15- [ key : string ] : any ;
16- }
178
/**
 * A piece of text paired with the embedding computed for it.
 *
 * This is the record shape passed to `insertEmbeddings`, e.g.
 * `{ values: embedding, input: queryText }`.
 */
export interface SemanticSearchVector {
  /** The text the embedding belongs to. */
  input: string;
  /** The embedding vector for `input`. */
  values: number[];
}
2713
28- export interface Chunk {
29- text : string ;
30- start : number ;
31- end : number ;
32- }
// Shared tokenizer instance used to encode/decode text when chunking.
const tokenizer = new GPT4Tokenizer({ type: "gpt3" });
// True only when the DEBUG_LOGGING environment variable is exactly "true".
const debugLoggingEnabled = process.env.DEBUG_LOGGING === "true";

/** Default per-chunk token budget used by `splitIntoChunks`. */
export const MAX_INPUT_TOKENS = 7000;
3718
3819export function isRateLimitExceeded ( err : unknown ) : boolean {
3920 return (
@@ -50,26 +31,20 @@ export function isRateLimitExceeded(err: unknown): boolean {
5031let numCompleted = 0 ;
5132
/**
 * Splits `content` into consecutive pieces of at most `maxInputTokens` tokens.
 *
 * Short content is returned untouched as a single-element array so the
 * tokenizer is skipped entirely. Anything else is encoded once, the token
 * array is cut into windows of `maxInputTokens`, and every window is decoded
 * back into text.
 *
 * @param content - Text to split. An empty string yields `[""]`.
 * @param maxInputTokens - Upper bound on tokens per chunk; must be greater
 *   than zero. Defaults to `MAX_INPUT_TOKENS`.
 * @returns The chunks in their original order.
 * @throws RangeError when `maxInputTokens` is not a positive number; without
 *   this guard the windowing loop below would never advance.
 */
const splitIntoChunks = (content: string, maxInputTokens = MAX_INPUT_TOKENS): string[] => {
  if (!(maxInputTokens > 0)) {
    throw new RangeError(`maxInputTokens must be a positive number, got ${maxInputTokens}`);
  }

  // Fast path: skip tokenizing when the text cannot exceed the budget.
  // The 2000-character cap is the original threshold; the second condition
  // makes the shortcut honour a caller-supplied limit as well.
  // NOTE(review): the factor 3 assumes a byte-level BPE tokenizer, where one
  // UTF-16 code unit expands to at most 3 tokens — confirm for GPT4Tokenizer.
  if (content.length < 2000 && content.length * 3 <= maxInputTokens) {
    return [content];
  }

  const encoded = tokenizer.encode(content);
  const chunks: string[] = [];

  // NOTE(review): a window boundary may fall inside a multi-token character,
  // in which case decode() could emit replacement characters — verify.
  for (let offset = 0; offset < encoded.length; offset += maxInputTokens) {
    chunks.push(tokenizer.decode(encoded.slice(offset, offset + maxInputTokens)));
  }

  return chunks;
};
7146
// Number of message texts requested per page; also used to compute the page
// count and each page's offset in `createEmbeddings`.
const PAGE_SIZE = 30_000;
7348
7449export const createEmbeddings = async ( { openAiKey } : { openAiKey : string } ) => {
7550 logger . info ( "Creating embeddings" ) ;
@@ -78,8 +53,6 @@ export const createEmbeddings = async ({ openAiKey }: { openAiKey: string }) =>
7853 const messageCount = await dbWorker . worker . countAllMessageTexts ( ) ;
7954
8055 const pages = Math . ceil ( messageCount / PAGE_SIZE ) ;
81- const existingText = await dbWorker . embeddingsWorker . getAllText ( ) ;
82- const set = new Set ( existingText ) ;
8356
8457 const configuration = new Configuration ( {
8558 apiKey : openAiKey ,
@@ -88,21 +61,17 @@ export const createEmbeddings = async ({ openAiKey }: { openAiKey: string }) =>
8861 const openai = new OpenAIApi ( configuration ) ;
8962
9063 const batchOpenai = new BatchOpenAi ( openai ) ;
91- const processMessage = async ( message : Awaited < ReturnType < typeof dbWorker . worker . getAllMessageTexts > > [ number ] ) => {
64+ const processMessage = async ( message : string ) => {
9265 try {
93- if ( ! message . text ) {
66+ if ( ! message ) {
9467 return ;
9568 }
9669
97- const chunks = splitIntoChunks ( message . text ) ;
98- const itemEmbeddings = await batchOpenai . addPendingVectors ( chunks , message . guid ) ;
99-
70+ const chunks = splitIntoChunks ( message ) ;
71+ const itemEmbeddings = await batchOpenai . addPendingVectors ( chunks ) ;
10072 if ( itemEmbeddings . length ) {
10173 try {
102- logger . info ( `Inserting ${ itemEmbeddings . length } vectors` ) ;
103- const embeddings = itemEmbeddings . map ( ( l ) => ( { embedding : l . values , text : l . metadata . text } ) ) ;
104- await dbWorker . embeddingsWorker . insertEmbeddings ( embeddings ) ;
105- logger . info ( `Inserted ${ itemEmbeddings . length } vectors` ) ;
74+ await dbWorker . embeddingsWorker . insertEmbeddings ( itemEmbeddings ) ;
10675 numCompleted += itemEmbeddings . length ;
10776 } catch ( e ) {
10877 logger . error ( e ) ;
@@ -117,17 +86,14 @@ export const createEmbeddings = async ({ openAiKey }: { openAiKey: string }) =>
11786 for ( let i = 0 ; i < pages ; i ++ ) {
11887 const messages = await dbWorker . worker . getAllMessageTexts ( PAGE_SIZE , i * PAGE_SIZE ) ;
11988 logger . info ( `Got ${ messages . length } messages - ${ i + 1 } of ${ pages } ` ) ;
120-
121- numCompleted = existingText . length ;
122- const notParsed = messages . filter ( ( m ) => m . text && ! set . has ( m . text ) ) ;
123-
124- const uniqueMessages = uniqBy ( notParsed , "text" ) ;
125-
126- await pMap ( uniqueMessages , processMessage , { concurrency : 100 } ) ;
127-
128- if ( debugLoggingEnabled ) {
129- logger . info ( `Completed ${ numCompleted } of ${ messageCount } (${ Math . round ( ( numCompleted / messageCount ) * 100 ) } %)` ) ;
130- }
89+ const now = performance . now ( ) ;
90+ const existingText = await dbWorker . embeddingsWorker . getExistingText ( messages ) ;
91+ logger . info ( `Got existing text in ${ performance . now ( ) - now } ms` ) ;
92+ const set = new Set ( existingText ) ;
93+ numCompleted += existingText . length ;
94+ const notParsed = messages . filter ( ( m ) => ! set . has ( m ) ) ;
95+ await pMap ( notParsed , processMessage , { concurrency : 50 } ) ;
96+ logger . info ( `Completed ${ numCompleted } of ${ messageCount } (${ Math . round ( ( numCompleted / messageCount ) * 100 ) } %)` ) ;
13197 }
13298 logger . info ( "Done creating embeddings" ) ;
13399} ;
@@ -159,7 +125,7 @@ export async function semanticQuery({ queryText, openAiKey }: SemanticQueryOpts)
159125 return [ ] ;
160126 }
161127 // save embedding
162- await dbWorker . embeddingsWorker . insertEmbeddings ( [ { embedding, text : queryText } ] ) ;
128+ await dbWorker . embeddingsWorker . insertEmbeddings ( [ { values : embedding , input : queryText } ] ) ;
163129 floatEmbedding = new Float32Array ( embedding ) ;
164130 }
165131
0 commit comments