// Source: tasq/node_modules/@claude-flow/embeddings/dist/chunking.js
// (251 lines, 8.3 KiB, JavaScript)

/**
* Document Chunking Utilities
*
* Features:
* - Configurable chunk size and overlap
* - Sentence-aware splitting
* - Paragraph-aware splitting
* - Token-based chunking (approximate)
* - Metadata tracking for reconstruction
*/
// Sentence boundary pattern: matches the whitespace that follows ., !, or ?
// and precedes a capital letter. Lookbehind/lookahead keep the punctuation
// attached to the sentence when used with String.prototype.split.
const SENTENCE_ENDINGS = /(?<=[.!?])\s+(?=[A-Z])/g;
// Two or more consecutive newlines delimit paragraphs.
const PARAGRAPH_BREAKS = /\n\n+/g;
/**
 * Split text into chunks with overlap.
 *
 * @param {string} text - Document to split.
 * @param {object} [config] - Optional settings:
 *   maxChunkSize (default 512), overlap (default 50),
 *   strategy: 'character' | 'sentence' | 'paragraph' | 'token' (default 'sentence'),
 *   minChunkSize (default 100), includeMetadata (default true).
 * @returns {{chunks: object[], originalLength: number, totalChunks: number, config: object}}
 *   `config` echoes the resolved (and clamped) settings actually used.
 */
export function chunkText(text, config = {}) {
    const maxChunkSize = config.maxChunkSize ?? 512;
    const finalConfig = {
        maxChunkSize,
        // Clamp overlap strictly below maxChunkSize: with overlap >= maxChunkSize
        // the character strategy's window never advances (infinite loop), and the
        // other strategies would carry more context than an entire chunk.
        overlap: Math.min(config.overlap ?? 50, Math.max(0, maxChunkSize - 1)),
        strategy: config.strategy ?? 'sentence',
        minChunkSize: config.minChunkSize ?? 100,
        includeMetadata: config.includeMetadata ?? true,
    };
    // Collapse whitespace runs so sentence/character offsets work on one line.
    const normalizedText = text.replace(/\s+/g, ' ').trim();
    let chunks;
    switch (finalConfig.strategy) {
        case 'character':
            chunks = chunkByCharacter(normalizedText, finalConfig);
            break;
        case 'paragraph':
            // Paragraph splitting needs the original newlines, so skip normalization.
            chunks = chunkByParagraph(text, finalConfig);
            break;
        case 'token':
            chunks = chunkByToken(normalizedText, finalConfig);
            break;
        case 'sentence':
        default:
            // 'sentence' and any unknown strategy share the same path.
            chunks = chunkBySentence(normalizedText, finalConfig);
    }
    return {
        chunks,
        originalLength: text.length,
        totalChunks: chunks.length,
        config: finalConfig,
    };
}
/**
 * Simple character-based chunking with a sliding, overlapping window.
 *
 * Fix: overlap is clamped strictly below the chunk size. Previously an
 * overlap >= maxChunkSize (or maxChunkSize <= 0) meant
 * `pos = endPos - overlap` never advanced, looping forever.
 *
 * @param {string} text - Text to split.
 * @param {{maxChunkSize: number, overlap: number}} config
 * @returns {object[]} Chunks with exact startPos/endPos offsets.
 */
function chunkByCharacter(text, config) {
    // Guard degenerate sizes so the window always makes forward progress:
    // each step advances by (size - overlap), which must be >= 1.
    const size = Math.max(1, config.maxChunkSize);
    const overlap = Math.min(config.overlap, size - 1);
    const chunks = [];
    let pos = 0;
    let index = 0;
    while (pos < text.length) {
        const endPos = Math.min(pos + size, text.length);
        const chunkText = text.slice(pos, endPos);
        chunks.push({
            text: chunkText,
            index,
            startPos: pos,
            endPos,
            length: chunkText.length,
            tokenCount: Math.ceil(chunkText.length / 4), // ~4 chars per token
        });
        // Slide back by `overlap` so adjacent chunks share context.
        pos = endPos - overlap;
        // Stop once the last emitted chunk already reached the end of text.
        if (pos >= text.length - overlap) {
            break;
        }
        index++;
    }
    return chunks;
}
/**
 * Sentence-aware chunking: packs whole sentences into chunks of at most
 * `maxChunkSize` characters, carrying the trailing `overlap` characters of
 * each chunk into the next one for context.
 *
 * NOTE: startPos/endPos are approximate — they are accumulated from trimmed
 * sentence lengths plus one separator character, not measured in `text`.
 * The `length` field reflects the pre-trim buffer, so it may exceed
 * `text.length` of the stored chunk by a leading-space character.
 *
 * @param {string} text - Text to split (ideally whitespace-normalized).
 * @param {{maxChunkSize: number, overlap: number, minChunkSize: number}} config
 * @returns {object[]} Chunk records.
 */
function chunkBySentence(text, config) {
    const { maxChunkSize, overlap, minChunkSize } = config;
    // Break on whitespace that follows .!? and precedes a capital letter.
    const sentences = text
        .split(/(?<=[.!?])\s+(?=[A-Z])/g)
        .filter((s) => s.trim().length > 0);
    const chunks = [];
    let buffer = '';          // sentences accumulated for the chunk in progress
    let bufferStart = 0;      // approximate offset where the buffer begins
    let chunkIndex = 0;       // index assigned to the next emitted chunk
    let cursor = 0;           // approximate offset of the current sentence
    const emit = (endPos) => {
        chunks.push({
            text: buffer.trim(),
            index: chunkIndex,
            startPos: bufferStart,
            endPos,
            length: buffer.length,
            tokenCount: Math.ceil(buffer.length / 4),
        });
    };
    for (const raw of sentences) {
        const sentence = raw.trim();
        const wouldOverflow = buffer.length + sentence.length > maxChunkSize;
        if (wouldOverflow && buffer.length >= minChunkSize) {
            emit(cursor);
            // Seed the next chunk with the tail of the previous one.
            const carried = buffer.slice(-overlap);
            buffer = carried + ' ' + sentence;
            bufferStart = cursor - overlap;
            chunkIndex++;
        }
        else {
            buffer = buffer.length > 0 ? buffer + ' ' + sentence : sentence;
        }
        cursor += sentence.length + 1;
    }
    // Flush whatever is left; its end is pinned to the true end of text.
    if (buffer.trim().length > 0) {
        emit(text.length);
    }
    return chunks;
}
/**
 * Paragraph-aware chunking.
 *
 * Groups whole paragraphs (separated by blank lines) into chunks of at most
 * `maxChunkSize` characters. A single paragraph larger than the limit is
 * itself re-split with the sentence strategy.
 *
 * NOTE(review): startPos/endPos are approximate — accumulated from trimmed
 * paragraph lengths plus a fixed 2-character separator, so they can drift
 * from true offsets when the source uses 3+ newlines between paragraphs or
 * paragraphs with surrounding whitespace. Also, `currentStart` after an
 * oversized paragraph omits the +2 separator that `textPos` adds — presumably
 * acceptable for this approximate model; confirm before relying on offsets.
 *
 * @param {string} text - Original (non-normalized) text with newlines intact.
 * @param {object} config - maxChunkSize / minChunkSize and sentence-strategy settings.
 * @returns {object[]} Chunk records.
 */
function chunkByParagraph(text, config) {
    const { maxChunkSize, minChunkSize } = config;
    // Split by paragraph breaks (2+ newlines) and drop empty fragments.
    const paragraphs = text.split(PARAGRAPH_BREAKS).filter(p => p.trim().length > 0);
    const chunks = [];
    let currentChunk = '';   // paragraphs accumulated for the chunk being built
    let currentStart = 0;    // approximate start offset of currentChunk
    let index = 0;           // next chunk index to assign
    let textPos = 0;         // approximate offset of the current paragraph
    for (const paragraph of paragraphs) {
        const trimmedPara = paragraph.trim();
        // If single paragraph exceeds max, fall back to sentence chunking
        if (trimmedPara.length > maxChunkSize) {
            // Flush any pending accumulated chunk first so ordering is kept.
            if (currentChunk.length > 0) {
                chunks.push({
                    text: currentChunk.trim(),
                    index,
                    startPos: currentStart,
                    endPos: textPos,
                    length: currentChunk.length,
                    tokenCount: Math.ceil(currentChunk.length / 4),
                });
                index++;
            }
            // Chunk the large paragraph by sentence, shifting the sub-chunk
            // offsets from paragraph-local into (approximate) text coordinates.
            const subChunks = chunkBySentence(trimmedPara, config);
            for (const subChunk of subChunks) {
                chunks.push({
                    ...subChunk,
                    index,
                    startPos: textPos + subChunk.startPos,
                    endPos: textPos + subChunk.endPos,
                });
                index++;
            }
            currentChunk = '';
            currentStart = textPos + trimmedPara.length;
        }
        // Current chunk is full enough and the next paragraph won't fit: emit it.
        else if (currentChunk.length + trimmedPara.length > maxChunkSize && currentChunk.length >= minChunkSize) {
            chunks.push({
                text: currentChunk.trim(),
                index,
                startPos: currentStart,
                endPos: textPos,
                length: currentChunk.length,
                tokenCount: Math.ceil(currentChunk.length / 4),
            });
            currentChunk = trimmedPara;
            currentStart = textPos;
            index++;
        }
        // Otherwise keep accumulating, re-joining paragraphs with a blank line.
        else {
            currentChunk += (currentChunk.length > 0 ? '\n\n' : '') + trimmedPara;
        }
        textPos += trimmedPara.length + 2; // +2 for paragraph break
    }
    // Add final chunk; its end is pinned to the true end of text.
    if (currentChunk.trim().length > 0) {
        chunks.push({
            text: currentChunk.trim(),
            index,
            startPos: currentStart,
            endPos: text.length,
            length: currentChunk.length,
            tokenCount: Math.ceil(currentChunk.length / 4),
        });
    }
    return chunks;
}
/**
 * Token-based chunking (approximate).
 *
 * Token limits are converted to character limits using the ~4 characters per
 * token heuristic, then the work is delegated to sentence-aware chunking.
 *
 * @param {string} text - Text to split.
 * @param {object} config - Limits expressed in tokens.
 * @returns {object[]} Chunk records (offsets/lengths are in characters).
 */
function chunkByToken(text, config) {
    const CHARS_PER_TOKEN = 4; // rough average for English text
    const { maxChunkSize, overlap, minChunkSize } = config;
    const charLimits = {
        ...config,
        maxChunkSize: maxChunkSize * CHARS_PER_TOKEN,
        overlap: overlap * CHARS_PER_TOKEN,
        minChunkSize: minChunkSize * CHARS_PER_TOKEN,
    };
    return chunkBySentence(text, charLimits);
}
/**
 * Estimate the token count of a string.
 *
 * Uses the common rough heuristic of ~4 characters per token; no real
 * tokenizer is involved.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Approximate token count (ceiling, so "" -> 0).
 */
export function estimateTokens(text) {
    const AVG_CHARS_PER_TOKEN = 4;
    return Math.ceil(text.length / AVG_CHARS_PER_TOKEN);
}
/**
 * Reconstruct the original text from chunks (approximate).
 *
 * Chunks are ordered by index, then stitched together by detecting the
 * longest shared suffix/prefix (up to 100 characters) between consecutive
 * chunks and dropping the duplicated prefix. Whitespace is normalized at
 * the end, so the result matches the original only up to whitespace runs.
 *
 * @param {Array<{index: number, text: string}>} chunks - Chunks in any order.
 * @returns {string} Reassembled, whitespace-normalized text.
 */
export function reconstructFromChunks(chunks) {
    if (chunks.length === 0)
        return '';
    if (chunks.length === 1)
        return chunks[0].text;
    // Work on a sorted copy; never mutate the caller's array.
    const ordered = [...chunks].sort((a, b) => a.index - b.index);
    // Longest suffix of `prev` that is also a prefix of `curr`, capped at 100.
    const sharedOverlap = (prev, curr) => {
        const limit = Math.min(100, prev.length, curr.length);
        const suffix = prev.slice(-limit);
        const prefix = curr.slice(0, limit);
        for (let len = limit; len > 0; len--) {
            if (prefix.startsWith(suffix.slice(-len))) {
                return len;
            }
        }
        return 0;
    };
    let assembled = ordered[0].text;
    for (let i = 1; i < ordered.length; i++) {
        const skip = sharedOverlap(ordered[i - 1].text, ordered[i].text);
        assembled += ' ' + ordered[i].text.slice(skip);
    }
    // Collapse whitespace introduced by the joins.
    return assembled.replace(/\s+/g, ' ').trim();
}
//# sourceMappingURL=chunking.js.map