68 lines
1.8 KiB
TypeScript
68 lines
1.8 KiB
TypeScript
/**
 * Document Chunking Utilities
 *
 * Features:
 * - Configurable chunk size and overlap
 * - Sentence-aware splitting
 * - Paragraph-aware splitting
 * - Token-based chunking (approximate)
 * - Metadata tracking for reconstruction
 */

/**
 * Chunking configuration.
 *
 * Every field is optional. The defaults quoted below are the ones documented
 * by this declaration; they are applied by the implementation, which is not
 * part of this file.
 */
export interface ChunkingConfig {
  /** Maximum chunk size in characters (default: 512). */
  maxChunkSize?: number;

  /** Overlap between consecutive chunks in characters (default: 50). */
  overlap?: number;

  /** Strategy used to pick split points (default: 'sentence'). */
  strategy?: 'character' | 'sentence' | 'paragraph' | 'token';

  /** Minimum chunk size (default: 100). */
  minChunkSize?: number;

  /**
   * Include metadata with chunks.
   * NOTE(review): no default is documented here — confirm against the implementation.
   */
  includeMetadata?: boolean;
}

/**
 * Chunk result with metadata.
 *
 * Describes one piece of a chunked document together with the positional
 * information recorded for it.
 */
export interface Chunk {
  /** Chunk text content. */
  text: string;

  /** Original index in document. */
  index: number;

  /**
   * Start position in original text.
   * NOTE(review): presumably a character offset — confirm against the implementation.
   */
  startPos: number;

  /**
   * End position in original text.
   * NOTE(review): whether this bound is inclusive or exclusive is not stated here — confirm.
   */
  endPos: number;

  /** Character count. */
  length: number;

  /** Approximate token count (chars / 4). */
  tokenCount: number;
}

/**
 * Chunked document result.
 *
 * Return shape of `chunkText`: the chunks plus summary information about the
 * input and the configuration that was used.
 */
export interface ChunkedDocument {
  /** Array of chunks. */
  chunks: Chunk[];

  /** Original text length. */
  originalLength: number;

  /** Total chunks created. */
  totalChunks: number;

  /**
   * Configuration used. Typed as `Required<ChunkingConfig>`, so every option
   * is present here even though each one is optional on input.
   */
  config: Required<ChunkingConfig>;
}

/**
 * Split text into chunks with overlap.
 *
 * Declaration only; the implementation is not part of this file.
 *
 * @param text - Text to split.
 * @param config - Optional chunking options; see `ChunkingConfig` for the
 *   documented defaults.
 * @returns The chunks together with the original length, the chunk count and
 *   the configuration used.
 */
export declare function chunkText(text: string, config?: ChunkingConfig): ChunkedDocument;

/**
 * Estimate token count for text.
 *
 * Declaration only; the implementation is not part of this file.
 * NOTE(review): presumably the same chars / 4 heuristic documented on
 * `Chunk.tokenCount` — confirm against the implementation.
 *
 * @param text - Text whose token count is estimated.
 * @returns The estimated number of tokens.
 */
export declare function estimateTokens(text: string): number;

/**
 * Reconstruct original text from chunks (approximate).
 *
 * Declaration only; the implementation is not part of this file.
 * NOTE(review): how overlapping regions between chunks are merged is not
 * visible here — confirm against the implementation before relying on an
 * exact round trip.
 *
 * @param chunks - Chunks to reassemble.
 * @returns The reconstructed text.
 */
export declare function reconstructFromChunks(chunks: Chunk[]): string;

//# sourceMappingURL=chunking.d.ts.map