251 lines
8.3 KiB
JavaScript
251 lines
8.3 KiB
JavaScript
/**
 * Document Chunking Utilities
 *
 * Features:
 * - Configurable chunk size and overlap
 * - Sentence-aware splitting
 * - Paragraph-aware splitting
 * - Token-based chunking (approximate)
 * - Metadata tracking for reconstruction
 */
|
|
// Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
// ASCII capital letter. Sentences that start with a digit, a quote or a
// non-ASCII capital are therefore not treated as boundaries.
const SENTENCE_ENDINGS = /(?<=[.!?])\s+(?=[A-Z])/g;

// Paragraph boundary: one line break followed by one or more blank lines.
// Accepts both LF and CRLF line endings, and treats lines that contain only
// spaces or tabs as blank.
const PARAGRAPH_BREAKS = /\r?\n(?:[ \t]*\r?\n)+/g;
|
|
/**
 * Split text into chunks using the configured strategy.
 *
 * Supported strategies are 'character', 'sentence', 'paragraph' and 'token';
 * any other value falls back to 'sentence'. Every strategy except 'paragraph'
 * receives the text with all whitespace runs collapsed to a single space and
 * the ends trimmed; 'paragraph' receives the original text so that blank
 * lines are still visible to it.
 *
 * @param {string} text - Text to split.
 * @param {object} [config] - Optional overrides; `null` is treated as `{}`.
 * @param {number} [config.maxChunkSize=512] - Upper size target per chunk.
 * @param {number} [config.overlap=50] - Overlap between consecutive chunks.
 * @param {string} [config.strategy='sentence'] - Chunking strategy name.
 * @param {number} [config.minChunkSize=100] - Minimum size before a chunk is closed.
 * @param {boolean} [config.includeMetadata=true] - Stored in the returned
 *   config; this function itself does not read it.
 * @returns {{chunks: object[], originalLength: number, totalChunks: number, config: object}}
 *   The chunks plus the length of the original text and the resolved config.
 * @throws {TypeError} If `text` is not a string.
 */
export function chunkText(text, config = {}) {
  if (typeof text !== 'string') {
    const received = text === null ? 'null' : typeof text;
    throw new TypeError(`chunkText: text must be a string, received ${received}`);
  }

  // The default parameter only covers `undefined`; guard an explicit `null` too.
  const options = config ?? {};
  const finalConfig = {
    maxChunkSize: options.maxChunkSize ?? 512,
    overlap: options.overlap ?? 50,
    strategy: options.strategy ?? 'sentence',
    minChunkSize: options.minChunkSize ?? 100,
    includeMetadata: options.includeMetadata ?? true,
  };

  // Normalize whitespace
  const normalizedText = text.replace(/\s+/g, ' ').trim();

  let chunks;
  switch (finalConfig.strategy) {
    case 'character':
      chunks = chunkByCharacter(normalizedText, finalConfig);
      break;
    case 'paragraph':
      // Keep the original text: normalization would erase the paragraph breaks.
      chunks = chunkByParagraph(text, finalConfig);
      break;
    case 'token':
      chunks = chunkByToken(normalizedText, finalConfig);
      break;
    case 'sentence':
    default:
      // Unknown strategies fall back to sentence chunking.
      chunks = chunkBySentence(normalizedText, finalConfig);
  }

  return {
    chunks,
    originalLength: text.length,
    totalChunks: chunks.length,
    config: finalConfig,
  };
}
|
|
/**
 * Simple character-based chunking with overlap.
 *
 * Produces fixed-size windows over `text`. Each window starts
 * `maxChunkSize - overlap` characters after the previous one, and the last
 * window ends exactly at the end of the text.
 *
 * The overlap is clamped to the range [0, maxChunkSize - 1] so that every
 * window advances by at least one character. Without the clamp, an overlap
 * greater than or equal to the chunk size never makes progress on text
 * longer than one chunk. A missing or non-numeric overlap counts as 0.
 *
 * @param {string} text - Text to split.
 * @param {{maxChunkSize: number, overlap: number}} config - Size settings, in characters.
 * @returns {Array<{text: string, index: number, startPos: number, endPos: number, length: number, tokenCount: number}>}
 *   Chunks in order; `startPos`/`endPos` are offsets into `text` (end exclusive).
 * @throws {RangeError} If `text` is non-empty and `maxChunkSize` is not a number >= 1.
 */
function chunkByCharacter(text, config) {
  const { maxChunkSize, overlap } = config;

  if (text.length === 0) {
    return [];
  }

  const size = Math.floor(Number(maxChunkSize));
  if (!Number.isFinite(size) || size < 1) {
    throw new RangeError(`chunkByCharacter: maxChunkSize must be a number >= 1, received ${maxChunkSize}`);
  }

  const requestedOverlap = Math.floor(Number(overlap));
  const safeOverlap = Number.isFinite(requestedOverlap)
    ? Math.min(Math.max(requestedOverlap, 0), size - 1)
    : 0;
  // Guaranteed >= 1, so the loop below always terminates.
  const step = size - safeOverlap;

  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + size, text.length);
    const piece = text.slice(start, end);

    chunks.push({
      text: piece,
      index: chunks.length,
      startPos: start,
      endPos: end,
      length: piece.length,
      tokenCount: Math.ceil(piece.length / 4),
    });

    // The window reached the end of the text; a further window would only
    // repeat the overlap region.
    if (end === text.length) {
      break;
    }
  }

  return chunks;
}
|
|
/**
 * Sentence-aware chunking - groups whole sentences into chunks.
 *
 * Sentences are obtained by splitting on SENTENCE_ENDINGS and are joined
 * with a single space. A chunk is closed when adding the next sentence
 * (including the joining space) would exceed `maxChunkSize` and the chunk
 * has already reached `minChunkSize`. The next chunk then starts with the
 * last `overlap` characters of the closed chunk; that tail is cut by
 * character count, so it may begin mid-sentence or mid-word.
 *
 * A chunk can still exceed `maxChunkSize`: a single sentence is never
 * split, and the overlap tail is added on top of the next sentence.
 *
 * `startPos`/`endPos` are offsets into `text` (end exclusive). They are
 * exact when sentences are separated by single spaces, so that
 * `text.slice(startPos, endPos) === chunk.text`. With other separators the
 * start of an overlapped chunk is an estimate.
 *
 * @param {string} text - Text to split.
 * @param {{maxChunkSize: number, overlap: number, minChunkSize: number}} config - Size settings, in characters.
 * @returns {Array<{text: string, index: number, startPos: number, endPos: number, length: number, tokenCount: number}>}
 *   Chunks in order; empty when `text` contains no non-blank sentence.
 */
function chunkBySentence(text, config) {
  const { maxChunkSize, overlap, minChunkSize } = config;

  // `slice(-0)` returns the whole string, so a zero overlap needs its own
  // path instead of being passed to slice().
  const overlapSize = Number.isFinite(overlap) && overlap > 0 ? Math.floor(overlap) : 0;

  const sentences = text
    .split(SENTENCE_ENDINGS)
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0);

  const chunks = [];
  let current = '';
  let currentStart = 0;
  let currentEnd = 0;
  let cursor = 0; // search position in `text` for the next sentence

  const flush = () => {
    chunks.push({
      text: current,
      index: chunks.length,
      startPos: currentStart,
      endPos: currentEnd,
      length: current.length,
      tokenCount: Math.ceil(current.length / 4),
    });
  };

  for (const sentence of sentences) {
    // Locate the sentence in the source text so offsets are real positions
    // rather than a running estimate.
    const found = text.indexOf(sentence, cursor);
    const sentenceStart = found === -1 ? cursor : found;
    const sentenceEnd = sentenceStart + sentence.length;
    cursor = sentenceEnd;

    const joinedLength = current.length === 0
      ? sentence.length
      : current.length + 1 + sentence.length;
    const shouldClose = current.length > 0
      && current.length >= minChunkSize
      && joinedLength > maxChunkSize;

    if (shouldClose) {
      flush();

      // Start the new chunk with the tail of the previous one. Leading
      // whitespace is dropped so the chunk text stays trimmed.
      const tail = overlapSize > 0 ? current.slice(-overlapSize).trimStart() : '';
      if (tail.length > 0) {
        current = `${tail} ${sentence}`;
        currentStart = Math.max(0, currentEnd - tail.length);
      } else {
        current = sentence;
        currentStart = sentenceStart;
      }
    } else if (current.length === 0) {
      current = sentence;
      currentStart = sentenceStart;
    } else {
      current = `${current} ${sentence}`;
    }

    currentEnd = sentenceEnd;
  }

  // Add final chunk
  if (current.length > 0) {
    flush();
  }

  return chunks;
}
|
|
/**
 * Paragraph-aware chunking.
 *
 * Paragraphs are obtained by splitting on PARAGRAPH_BREAKS, trimmed, and
 * joined with a blank line ('\n\n'). A chunk is closed when adding the next
 * paragraph (including the joiner) would exceed `maxChunkSize` and the chunk
 * has already reached `minChunkSize`. A paragraph that is itself longer than
 * `maxChunkSize` closes the pending chunk and is handed to chunkBySentence;
 * the resulting sub-chunks are re-indexed and their offsets shifted to the
 * paragraph's position in `text`.
 *
 * `startPos`/`endPos` are offsets into `text` (end exclusive), found by
 * locating each paragraph in the source, so they stay correct whatever the
 * length of the separator between paragraphs.
 *
 * @param {string} text - Original (non-normalized) text.
 * @param {{maxChunkSize: number, minChunkSize: number}} config - Size settings,
 *   in characters; the whole object is forwarded to chunkBySentence.
 * @returns {Array<{text: string, index: number, startPos: number, endPos: number, length: number, tokenCount: number}>}
 *   Chunks in order.
 */
function chunkByParagraph(text, config) {
  const { maxChunkSize, minChunkSize } = config;
  const joiner = '\n\n';

  const paragraphs = text
    .split(PARAGRAPH_BREAKS)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);

  const chunks = [];
  let current = '';
  let currentStart = 0;
  let currentEnd = 0;
  let cursor = 0; // search position in `text` for the next paragraph

  // Emit the pending chunk, if any, and reset the accumulator.
  const flush = () => {
    if (current.length === 0) {
      return;
    }
    chunks.push({
      text: current,
      index: chunks.length,
      startPos: currentStart,
      endPos: currentEnd,
      length: current.length,
      tokenCount: Math.ceil(current.length / 4),
    });
    current = '';
  };

  for (const paragraph of paragraphs) {
    const found = text.indexOf(paragraph, cursor);
    const paragraphStart = found === -1 ? cursor : found;
    const paragraphEnd = paragraphStart + paragraph.length;
    cursor = paragraphEnd;

    // If single paragraph exceeds max, fall back to sentence chunking
    if (paragraph.length > maxChunkSize) {
      flush();
      for (const subChunk of chunkBySentence(paragraph, config)) {
        chunks.push({
          ...subChunk,
          index: chunks.length,
          // Sub-chunk offsets are relative to the paragraph.
          startPos: paragraphStart + subChunk.startPos,
          endPos: paragraphStart + subChunk.endPos,
        });
      }
      continue;
    }

    const joinedLength = current.length === 0
      ? paragraph.length
      : current.length + joiner.length + paragraph.length;
    if (joinedLength > maxChunkSize && current.length >= minChunkSize) {
      flush();
    }

    if (current.length === 0) {
      current = paragraph;
      currentStart = paragraphStart;
    } else {
      current += joiner + paragraph;
    }
    currentEnd = paragraphEnd;
  }

  // Add final chunk
  flush();

  return chunks;
}
|
|
/**
 * Token-based chunking (approximate - uses chars/4 as estimate).
 *
 * Interprets `maxChunkSize`, `overlap` and `minChunkSize` as token counts,
 * converts each to a character count at a fixed 4 characters per token, and
 * delegates to sentence-aware chunking with the converted limits. All other
 * config keys are forwarded unchanged.
 *
 * @param {string} text - Text to split.
 * @param {{maxChunkSize: number, overlap: number, minChunkSize: number}} config - Limits expressed in tokens.
 * @returns {object[]} Whatever chunkBySentence returns for the converted config.
 */
function chunkByToken(text, config) {
  // Rough estimate: 1 token ≈ 4 characters.
  const CHARS_PER_TOKEN = 4;
  const toChars = (tokens) => tokens * CHARS_PER_TOKEN;

  return chunkBySentence(text, {
    ...config,
    maxChunkSize: toChars(config.maxChunkSize),
    overlap: toChars(config.overlap),
    minChunkSize: toChars(config.minChunkSize),
  });
}
|
|
/**
 * Estimate token count for text.
 *
 * Uses a simple heuristic of about 4 characters per token, rounded up.
 * `null` and `undefined` count as empty text; any other non-string value is
 * converted with String() before measuring.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated number of tokens (0 for empty or missing text).
 */
export function estimateTokens(text) {
  if (text == null) {
    return 0;
  }
  return Math.ceil(String(text).length / 4);
}
|
|
/**
 * Length of the longest suffix of `previous` that is also a prefix of `next`.
 * Returns 0 when the two strings share no such overlap.
 */
function findOverlapLength(previous, next) {
  const limit = Math.min(previous.length, next.length);
  for (let len = limit; len > 0; len--) {
    if (next.startsWith(previous.slice(-len))) {
      return len;
    }
  }
  return 0;
}

/**
 * Reconstruct original text from chunks (approximate).
 *
 * Chunks are ordered by `index`. For each consecutive pair, the longest
 * suffix of the earlier chunk that is also a prefix of the later chunk is
 * treated as overlap and dropped from the later chunk, which is then
 * appended directly. When no overlap is found the chunks are joined with a
 * single space. Finally all whitespace runs are collapsed to one space and
 * the result is trimmed; this applies to a single chunk as well.
 *
 * The result is approximate: repeated text at a chunk boundary can be
 * mistaken for overlap, and chunks cut mid-word without any overlap are
 * joined with a space.
 *
 * @param {Array<{text: string, index: number}>} chunks - Chunks to merge; not mutated.
 * @returns {string} The merged, whitespace-normalized text ('' for no chunks).
 * @throws {TypeError} If `chunks` is not an array.
 */
export function reconstructFromChunks(chunks) {
  if (!Array.isArray(chunks)) {
    throw new TypeError('reconstructFromChunks: chunks must be an array');
  }
  if (chunks.length === 0) {
    return '';
  }

  // Sort a copy so the caller's array keeps its order.
  const ordered = [...chunks].sort((a, b) => a.index - b.index);

  let result = ordered[0].text;
  for (let i = 1; i < ordered.length; i++) {
    const previousText = ordered[i - 1].text;
    const currentText = ordered[i].text;

    // The search spans the full shorter chunk, so overlaps of any configured
    // size can be detected.
    const shared = findOverlapLength(previousText, currentText);

    // After an overlap the remainder continues exactly where the previous
    // chunk stopped; inserting a separator there would split words.
    result += shared > 0 ? currentText.slice(shared) : ` ${currentText}`;
  }

  return result.replace(/\s+/g, ' ').trim();
}
|
|
//# sourceMappingURL=chunking.js.map
|