/**
 * Embedding Normalization Utilities
 *
 * Features:
 * - L2 (Euclidean) normalization
 * - L1 (Manhattan) normalization
 * - Min-max normalization
 * - Z-score standardization
 * - Batch normalization
 */

/**
 * L2 (Euclidean) normalize an embedding to unit length.
 * Most common preprocessing step for cosine similarity.
 *
 * The input is never modified; a new Float32Array is always returned.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {number} [epsilon=1e-12] - Norms at or below this value are treated
 *   as zero to prevent division by zero
 * @returns {Float32Array} Normalized embedding with ||v|| = 1, or all zeros
 *   when the norm does not exceed epsilon
 */
export function l2Normalize(embedding, epsilon = 1e-12) {
  const length = embedding.length;
  // The result is a Float32Array regardless of the input's type.
  const result = new Float32Array(length);

  // Calculate L2 norm (Euclidean length)
  let sumSquares = 0;
  for (let i = 0; i < length; i++) {
    sumSquares += embedding[i] * embedding[i];
  }
  const norm = Math.sqrt(sumSquares);

  // A (near-)zero vector has no direction: scale by 0 instead of dividing.
  const scale = norm > epsilon ? 1 / norm : 0;

  // Normalize
  for (let i = 0; i < length; i++) {
    result[i] = embedding[i] * scale;
  }
  return result;
}

/**
 * L2 normalize an embedding in-place (modifies the original array).
 *
 * @param {ArrayLike<number>} embedding - Vector to normalize; its elements
 *   are overwritten
 * @param {number} [epsilon=1e-12] - Norms at or below this value are treated
 *   as zero to prevent division by zero
 * @returns {ArrayLike<number>} The same `embedding` object that was passed in;
 *   every element is set to zero when the norm does not exceed epsilon
 */
export function l2NormalizeInPlace(embedding, epsilon = 1e-12) {
  const length = embedding.length;

  let sumSquares = 0;
  for (let i = 0; i < length; i++) {
    sumSquares += embedding[i] * embedding[i];
  }
  const norm = Math.sqrt(sumSquares);

  // A (near-)zero vector has no direction: scale by 0 instead of dividing.
  const scale = norm > epsilon ? 1 / norm : 0;

  for (let i = 0; i < length; i++) {
    embedding[i] *= scale;
  }
  return embedding;
}

/**
 * L1 (Manhattan) normalize an embedding so that the sum of the absolute
 * values of its components equals 1.
 *
 * The input is never modified; a new Float32Array is always returned.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {number} [epsilon=1e-12] - Norms at or below this value are treated
 *   as zero to prevent division by zero
 * @returns {Float32Array} Normalized embedding, or all zeros when the L1 norm
 *   does not exceed epsilon
 */
export function l1Normalize(embedding, epsilon = 1e-12) {
  const length = embedding.length;
  const result = new Float32Array(length);

  // Calculate L1 norm (sum of absolute values)
  let sumAbs = 0;
  for (let i = 0; i < length; i++) {
    sumAbs += Math.abs(embedding[i]);
  }

  // A (near-)zero vector cannot be rescaled: scale by 0 instead of dividing.
  const scale = sumAbs > epsilon ? 1 / sumAbs : 0;

  for (let i = 0; i < length; i++) {
    result[i] = embedding[i] * scale;
  }
  return result;
}

/**
 * Min-max normalize an embedding to the [0, 1] range.
 *
 * The smallest component maps to 0 and the largest to 1. The input is never
 * modified; a new Float32Array is always returned.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {number} [epsilon=1e-12] - Ranges (max - min) at or below this value
 *   are treated as zero to prevent division by zero
 * @returns {Float32Array} Rescaled embedding, or all zeros when the range
 *   does not exceed epsilon (e.g. a constant vector)
 */
export function minMaxNormalize(embedding, epsilon = 1e-12) {
  const length = embedding.length;
  const result = new Float32Array(length);

  // Find min and max
  let min = Infinity;
  let max = -Infinity;
  for (let i = 0; i < length; i++) {
    const value = embedding[i];
    // Two independent checks: the first element must update both bounds.
    if (value < min) {
      min = value;
    }
    if (value > max) {
      max = value;
    }
  }

  // For empty input the range is -Infinity, which also selects scale 0.
  const range = max - min;
  const scale = range > epsilon ? 1 / range : 0;

  for (let i = 0; i < length; i++) {
    result[i] = (embedding[i] - min) * scale;
  }
  return result;
}

/**
 * Z-score standardize an embedding (mean = 0, std = 1).
 *
 * Uses the population standard deviation (divides by n, not n - 1). The
 * input is never modified; a new Float32Array is always returned.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {number} [epsilon=1e-12] - Standard deviations at or below this
 *   value are treated as zero to prevent division by zero
 * @returns {Float32Array} Standardized embedding, or all zeros when the
 *   standard deviation does not exceed epsilon (e.g. a constant vector)
 */
export function zScoreNormalize(embedding, epsilon = 1e-12) {
  const n = embedding.length;
  const result = new Float32Array(n);

  // Calculate mean
  let sum = 0;
  for (let i = 0; i < n; i++) {
    sum += embedding[i];
  }
  const mean = sum / n;

  // Calculate standard deviation
  let sumSquaredDiff = 0;
  for (let i = 0; i < n; i++) {
    const diff = embedding[i] - mean;
    sumSquaredDiff += diff * diff;
  }
  const std = Math.sqrt(sumSquaredDiff / n);

  // For empty input std is NaN; `NaN > epsilon` is false, so scale is 0.
  const scale = std > epsilon ? 1 / std : 0;

  // Standardize
  for (let i = 0; i < n; i++) {
    result[i] = (embedding[i] - mean) * scale;
  }
  return result;
}

/**
 * Normalize an embedding using the specified method.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {object} [options] - Normalization options
 * @param {string} [options.type='l2'] - One of 'l2', 'l1', 'minmax',
 *   'zscore' or 'none'; any other value falls back to L2 normalization
 * @param {number} [options.epsilon=1e-12] - Small value forwarded to the
 *   selected normalizer to prevent division by zero
 * @param {boolean} [options.inPlace=false] - Overwrite the input instead of
 *   allocating a result; only honored when `type` is 'l2' and `embedding`
 *   is a Float32Array, otherwise ignored
 * @returns {Float32Array} The normalized embedding. For type 'none' a
 *   Float32Array input is returned as-is (same object, not a copy) and any
 *   other input is converted to a new Float32Array.
 */
export function normalize(embedding, options = {}) {
  const { type = 'l2', epsilon = 1e-12, inPlace = false } = options;

  if (type === 'none') {
    return embedding instanceof Float32Array
      ? embedding
      : new Float32Array(embedding);
  }

  // In-place is only available for explicit L2 on a Float32Array.
  if (inPlace && type === 'l2' && embedding instanceof Float32Array) {
    return l2NormalizeInPlace(embedding, epsilon);
  }

  switch (type) {
    case 'l1':
      return l1Normalize(embedding, epsilon);
    case 'minmax':
      return minMaxNormalize(embedding, epsilon);
    case 'zscore':
      return zScoreNormalize(embedding, epsilon);
    case 'l2':
    default:
      // Unrecognized types fall back to L2.
      return l2Normalize(embedding, epsilon);
  }
}

/**
 * Batch normalize multiple embeddings.
 *
 * Each embedding is normalized independently by calling `normalize` with the
 * same options object.
 *
 * @param {Array<ArrayLike<number>>} embeddings - Embeddings to normalize
 * @param {object} [options] - Options forwarded unchanged to `normalize`
 * @returns {Array} One `normalize` result per input embedding, in input order
 */
export function normalizeBatch(embeddings, options = {}) {
  return embeddings.map((embedding) => normalize(embedding, options));
}

/**
 * Calculate the L2 (Euclidean) norm of an embedding.
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @returns {number} Square root of the sum of squared components; 0 for
 *   empty input
 */
export function l2Norm(embedding) {
  const length = embedding.length;
  let sumSquares = 0;
  for (let i = 0; i < length; i++) {
    sumSquares += embedding[i] * embedding[i];
  }
  return Math.sqrt(sumSquares);
}

/**
 * Check whether an embedding is already normalized (L2 norm ≈ 1).
 *
 * @param {ArrayLike<number>} embedding - Input embedding vector
 * @param {number} [tolerance=1e-6] - Maximum allowed absolute deviation of
 *   the norm from 1 (exclusive)
 * @returns {boolean} True when |norm - 1| < tolerance
 */
export function isNormalized(embedding, tolerance = 1e-6) {
  const norm = l2Norm(embedding);
  return Math.abs(norm - 1) < tolerance;
}

/**
 * Center embeddings by subtracting the per-dimension mean across the batch.
 * Useful for improving similarity metrics.
 *
 * The dimensionality is taken from the first embedding. The inputs are never
 * modified; a new Float32Array is returned for each embedding.
 *
 * @param {Array<ArrayLike<number>>} embeddings - Batch of embeddings
 * @returns {Float32Array[]} Centered embeddings in input order; an empty
 *   array when the batch is empty
 */
export function centerEmbeddings(embeddings) {
  const n = embeddings.length;
  if (n === 0) {
    return [];
  }
  const dim = embeddings[0].length;

  // Calculate mean for each dimension. Accumulate in double precision:
  // summing into a Float32Array would round after every addition and lose
  // accuracy for large batches or large component values.
  const mean = new Float64Array(dim);
  for (const emb of embeddings) {
    for (let i = 0; i < dim; i++) {
      mean[i] += emb[i];
    }
  }
  for (let i = 0; i < dim; i++) {
    mean[i] /= n;
  }

  // Subtract mean from each embedding
  return embeddings.map((emb) => {
    const centered = new Float32Array(dim);
    for (let i = 0; i < dim; i++) {
      centered[i] = emb[i] - mean[i];
    }
    return centered;
  });
}

//# sourceMappingURL=normalization.js.map