/**
 * CLAUDE.md Analyzer & Auto-Optimizer
 *
 * Quantifiable, verifiable analysis of CLAUDE.md files.
 * Measures structure quality, coverage, enforceability, and produces
 * a numeric score (0-100) that can be tracked over time.
 *
 * The auto-optimizer takes analysis results and produces a concrete
 * list of changes that would improve the score. Changes can be applied
 * programmatically and the score re-measured to verify improvement.
 *
 * @module @claude-flow/guidance/analyzer
 */
import type { ProofEnvelope } from './proof.js';
/** Score breakdown for a single dimension (0-100 each) */
export interface DimensionScore {
    /** Dimension name */
    name: string;
    /** Score 0-100 */
    score: number;
    /** Maximum possible score */
    max: number;
    /** Weight in composite calculation */
    weight: number;
    /** Human-readable findings */
    findings: string[];
}
/** Complete analysis result */
export interface AnalysisResult {
    /** Composite score 0-100 */
    compositeScore: number;
    /** Letter grade A-F */
    grade: string;
    /** Per-dimension scores */
    dimensions: DimensionScore[];
    /** Structural metrics */
    metrics: AnalysisMetrics;
    /** Actionable improvement suggestions */
    suggestions: Suggestion[];
    /** Timestamp */
    analyzedAt: number;
}
/** Raw metrics extracted from the file */
export interface AnalysisMetrics {
    /** Total lines */
    totalLines: number;
    /** Non-blank, non-comment lines */
    contentLines: number;
    /** Number of markdown headings */
    headingCount: number;
    /** Number of H2 sections */
    sectionCount: number;
    /** Estimated constitution lines (first section block) */
    constitutionLines: number;
    /** Number of rule-like statements (imperative sentences) */
    ruleCount: number;
    /** Number of code blocks */
    codeBlockCount: number;
    /** Number of NEVER/ALWAYS/MUST statements */
    enforcementStatements: number;
    /** Number of framework/tool mentions */
    toolMentions: number;
    /** Estimated shard count after compilation */
    estimatedShards: number;
    /** Has build command */
    hasBuildCommand: boolean;
    /** Has test command */
    hasTestCommand: boolean;
    /** Has security section */
    hasSecuritySection: boolean;
    /** Has architecture section */
    hasArchitectureSection: boolean;
    /** Lines in longest section */
    longestSectionLines: number;
    /** Has @import directives */
    hasImports: boolean;
    /** Number of domain-specific rules */
    domainRuleCount: number;
}
/** A concrete improvement suggestion */
export interface Suggestion {
    /** What to change */
    action: 'add' | 'remove' | 'restructure' | 'split' | 'strengthen';
    /** Priority */
    priority: 'high' | 'medium' | 'low';
    /** Which dimension this improves */
    dimension: string;
    /** Human-readable description */
    description: string;
    /** Estimated score improvement */
    estimatedImprovement: number;
    /** Concrete text to add/modify (if applicable) */
    patch?: string;
}
/** Before/after benchmark result */
export interface BenchmarkResult {
    before: AnalysisResult;
    after: AnalysisResult;
    delta: number;
    improvements: DimensionDelta[];
    regressions: DimensionDelta[];
}
interface DimensionDelta {
    dimension: string;
    before: number;
    after: number;
    delta: number;
}
/** Context size preset for optimization */
export type ContextSize = 'compact' | 'standard' | 'full';
/** Configuration for size-aware optimization */
export interface OptimizeOptions {
    /** Target context size */
    contextSize?: ContextSize;
    /** Optional local overlay content */
    localContent?: string;
    /** Maximum optimization iterations */
    maxIterations?: number;
    /** Target score (stop when reached) */
    targetScore?: number;
    /** HMAC key for proof chain (enables cryptographic proof of optimization) */
    proofKey?: string;
}
/** Result of headless benchmark via claude -p */
export interface HeadlessBenchmarkResult {
    /** Before optimization metrics */
    before: {
        analysis: AnalysisResult;
        suitePassRate: number;
        violationCount: number;
        taskResults: HeadlessTaskResult[];
    };
    /** After optimization metrics */
    after: {
        analysis: AnalysisResult;
        suitePassRate: number;
        violationCount: number;
        taskResults: HeadlessTaskResult[];
    };
    /** Score delta */
    delta: number;
    /** Proof chain with cryptographic verification */
    proofChain: ProofEnvelope[];
    /** Formatted report */
    report: string;
}
/** Result of a single headless task run */
export interface HeadlessTaskResult {
    taskId: string;
    prompt: string;
    passed: boolean;
    violations: string[];
    durationMs: number;
}
/**
 * Analyze a CLAUDE.md file and produce quantifiable scores.
 *
 * Scores 6 dimensions (0-100 each), weighted into a composite:
 * - Structure (20%): headings, sections, length, organization
 * - Coverage (20%): build/test/security/architecture/domain
 * - Enforceability (25%): NEVER/ALWAYS statements, concrete rules
 * - Compilability (15%): how well it compiles to constitution + shards
 * - Clarity (10%): code blocks, examples, specificity
 * - Completeness (10%): missing common sections
 */
export declare function analyze(content: string, localContent?: string): AnalysisResult;
/**
 * Run a before/after benchmark.
 * Returns the delta and per-dimension changes.
 */
export declare function benchmark(before: string, after: string, localContent?: string): BenchmarkResult;
/**
 * Auto-optimize a CLAUDE.md file by applying high-priority suggestions.
 * Returns the optimized content and the benchmark result.
 */
export declare function autoOptimize(content: string, localContent?: string, maxIterations?: number): {
    optimized: string;
    benchmark: BenchmarkResult;
    appliedSuggestions: Suggestion[];
};
/**
 * Context-size-aware optimization that restructures content to reach 90%+.
 *
 * Unlike autoOptimize (which only appends), this function:
 * 1. Splits oversized sections into subsections
 * 2. Extracts enforcement prose into list-format rules
 * 3. Trims the constitution to budget
 * 4. Removes redundant content
 * 5. Adds missing coverage sections
 * 6. Applies iterative patch suggestions
 *
 * @param content - CLAUDE.md content
 * @param options - Optimization options with contextSize and targetScore
 * @returns Optimized content, benchmark, and proof chain
 */
export declare function optimizeForSize(content: string, options?: OptimizeOptions): {
    optimized: string;
    benchmark: BenchmarkResult;
    appliedSteps: string[];
    proof: ProofEnvelope[];
};
/**
 * Run a headless benchmark using `claude -p` to measure actual agent
 * compliance before and after optimization.
 *
 * Requires `claude` CLI to be installed. Uses the proof chain to create
 * tamper-evident records of each test run.
 *
 * @param originalContent - Original CLAUDE.md
 * @param optimizedContent - Optimized CLAUDE.md
 * @param options - Options including proof key and executor
 */
export declare function headlessBenchmark(originalContent: string, optimizedContent: string, options?: {
    proofKey?: string;
    executor?: IHeadlessExecutor;
    tasks?: HeadlessBenchmarkTask[];
    workDir?: string;
}): Promise<HeadlessBenchmarkResult>;
/** Executor interface for headless claude commands */
export interface IHeadlessExecutor {
    execute(prompt: string, workDir: string): Promise<{
        stdout: string;
        stderr: string;
        exitCode: number;
    }>;
}
/**
 * Content-aware executor that adapts behavior based on CLAUDE.md content.
 *
 * When `validateEffect()` detects this interface, it calls `setContext()`
 * before each phase (before/after) so the executor can vary its responses
 * based on the quality of the loaded CLAUDE.md. This is the key mechanism
 * that makes the empirical validation meaningful — without it, the same
 * executor produces identical adherence for both phases.
 */
export interface IContentAwareExecutor extends IHeadlessExecutor {
    /** Set the CLAUDE.md content that the executor should use as behavioral context */
    setContext(claudeMdContent: string): void;
}
/** Benchmark task definition */
interface HeadlessBenchmarkTask {
    id: string;
    prompt: string;
    expectForbidden: string[];
    expectPresent: string[];
}
/**
 * Format analysis result as a human-readable report.
 */
export declare function formatReport(result: AnalysisResult): string;
/**
 * Format benchmark result as a comparison table.
 */
export declare function formatBenchmark(result: BenchmarkResult): string;
/**
 * An assertion about expected agent behavior.
 */
export interface ValidationAssertion {
    /** What to check */
    type: 'must-contain' | 'must-not-contain' | 'must-match-pattern' | 'must-mention-tool';
    /** The value to check (string literal or regex pattern for must-match-pattern) */
    value: string;
    /** How bad is a failure? */
    severity: 'critical' | 'major' | 'minor';
}
/**
 * A compliance task that tests whether the agent adheres to a specific
 * dimension's expected behavior.
 */
export interface ValidationTask {
    /** Unique task identifier */
    id: string;
    /** Which scoring dimension this task validates */
    dimension: string;
    /** The prompt to send to the agent */
    prompt: string;
    /** Assertions about the agent's output */
    assertions: ValidationAssertion[];
    /** Importance weight within its dimension (0-1) */
    weight: number;
}
/**
 * Result of running a single validation task.
 */
export interface ValidationTaskResult {
    taskId: string;
    dimension: string;
    passed: boolean;
    assertionResults: {
        assertion: ValidationAssertion;
        passed: boolean;
        detail: string;
    }[];
    output: string;
    durationMs: number;
}
/**
 * A single validation run against one CLAUDE.md version.
 */
export interface ValidationRun {
    /** Analysis of the CLAUDE.md used */
    analysis: AnalysisResult;
    /** Per-task results */
    taskResults: ValidationTaskResult[];
    /** Overall adherence rate (0-1) — weighted by severity */
    adherenceRate: number;
    /** Per-dimension adherence rates, keyed by dimension name */
    dimensionAdherence: Record<string, number>;
    /** Timestamp */
    timestamp: number;
}
/**
 * Statistical correlation between score changes and behavioral changes.
 */
export interface CorrelationResult {
    /** Per-dimension score vs adherence comparison */
    dimensionCorrelations: {
        dimension: string;
        scoreBefore: number;
        scoreAfter: number;
        scoreDelta: number;
        adherenceBefore: number;
        adherenceAfter: number;
        adherenceDelta: number;
        /** Did score and adherence move in the same direction? */
        concordant: boolean;
    }[];
    /** Pearson correlation coefficient (-1 to 1) */
    pearsonR: number;
    /** Spearman rank correlation coefficient (-1 to 1) — more robust for small samples */
    spearmanRho: number;
    /** Cohen's d effect size (null if insufficient data) */
    cohensD: number | null;
    /** Human-readable effect size label */
    effectSizeLabel: string;
    /** Number of data points */
    n: number;
    /** Is the correlation statistically significant? (|r| > threshold for n) */
    significant: boolean;
    /** Overall verdict */
    verdict: 'positive-effect' | 'negative-effect' | 'no-effect' | 'inconclusive';
}
/**
 * Complete validation report proving (or disproving) that score improvements
 * lead to behavioral improvements.
 */
export interface ValidationReport {
    /** Run against original CLAUDE.md */
    before: ValidationRun;
    /** Run against optimized CLAUDE.md */
    after: ValidationRun;
    /** Statistical correlation analysis */
    correlation: CorrelationResult;
    /** Cryptographic proof chain */
    proofChain: ProofEnvelope[];
    /** Formatted human-readable report */
    report: string;
}
/**
 * Empirically validate that score improvements produce behavioral improvements.
 *
 * Runs a suite of compliance tasks against both the original and optimized
 * CLAUDE.md, then computes statistical correlations between per-dimension
 * score deltas and per-dimension adherence rate deltas.
 *
 * **Content-aware executors**: If the executor implements `IContentAwareExecutor`,
 * `setContext()` is called before each phase with the corresponding CLAUDE.md
 * content. This is the key mechanism that allows the executor to vary its
 * behavior based on the quality of the loaded guidance — without it, the same
 * executor produces identical adherence for both phases.
 *
 * The result includes:
 * - Per-dimension concordance (did score and adherence move together?)
 * - Pearson r and Spearman rho correlation coefficients
 * - Cohen's d effect size with interpretation
 * - A verdict: positive-effect, negative-effect, no-effect, or inconclusive
 * - A formatted report with full task breakdown
 * - Optional proof chain for tamper-evident audit trail
 *
 * @param originalContent - Original CLAUDE.md content
 * @param optimizedContent - Optimized CLAUDE.md content
 * @param options - Executor, tasks, proof key, work directory, trials
 * @returns ValidationReport with statistical evidence
 */
export declare function validateEffect(originalContent: string, optimizedContent: string, options?: {
    executor?: IHeadlessExecutor;
    tasks?: ValidationTask[];
    proofKey?: string;
    workDir?: string;
    /** Number of trials per phase (default 1). Higher values average out noise. */
    trials?: number;
}): Promise<ValidationReport>;
/** Task class categories for the A/B benchmark */
export type ABTaskClass = 'bug-fix' | 'feature' | 'refactor' | 'security' | 'deployment' | 'test' | 'performance';
/** A single benchmark task representing a real Claude Flow scenario */
export interface ABTask {
    /** Unique task identifier */
    id: string;
    /** Human-readable description */
    description: string;
    /** Task class for grouping results */
    taskClass: ABTaskClass;
    /** Prompt sent to the executor */
    prompt: string;
    /** Assertions to evaluate pass/fail */
    assertions: ValidationAssertion[];
    /** Violation patterns to detect via gate simulation */
    gatePatterns: ABGatePattern[];
}
/** A pattern the gate simulator checks for in executor output */
export interface ABGatePattern {
    /** What kind of violation this detects */
    category: 'destructive-command' | 'hardcoded-secret' | 'force-push' | 'unsafe-type' | 'skipped-hook' | 'missing-test' | 'policy-violation';
    /** Regex pattern to match in output */
    pattern: string;
    /** Severity of the violation */
    severity: 'critical' | 'major' | 'minor';
}
/** Result for a single task in either config A or config B */
export interface ABTaskResult {
    /** Task ID */
    taskId: string;
    /** Task class */
    taskClass: ABTaskClass;
    /** Did all assertions pass? */
    passed: boolean;
    /** Assertion evaluation details */
    assertionResults: {
        assertion: ValidationAssertion;
        passed: boolean;
        detail: string;
    }[];
    /** Gate violations detected */
    violations: {
        category: string;
        pattern: string;
        severity: string;
    }[];
    /** Would a human need to intervene? (any critical violation) */
    humanIntervention: boolean;
    /** Simulated tool call count (extracted from output) */
    toolCalls: number;
    /** Simulated token spend (estimated from output length) */
    tokenSpend: number;
    /** Raw executor output */
    output: string;
    /** Execution duration in ms */
    durationMs: number;
}
/** Aggregated KPIs for one config (A or B) */
export interface ABMetrics {
    /** Fraction of tasks that passed (0-1) */
    successRate: number;
    /** Total wall clock time in ms */
    wallClockMs: number;
    /** Average tool calls per task */
    avgToolCalls: number;
    /** Average token spend per task */
    avgTokenSpend: number;
    /** Total gate violations */
    totalViolations: number;
    /** Tasks requiring human intervention */
    humanInterventions: number;
    /** Per-task-class success rates, keyed by task-class label */
    classSuccessRates: Record<string, number>;
    /** Composite score: success_rate - 0.1*norm_cost - 0.2*violations - 0.1*interventions */
    compositeScore: number;
}
/** Complete A/B benchmark report */
export interface ABReport {
    /** Config A results (no control plane) */
    configA: {
        label: string;
        taskResults: ABTaskResult[];
        metrics: ABMetrics;
    };
    /** Config B results (with Phase 1 control plane) */
    configB: {
        label: string;
        taskResults: ABTaskResult[];
        metrics: ABMetrics;
    };
    /** Composite score delta (B - A) */
    compositeDelta: number;
    /** Per-task-class deltas, keyed by task-class label */
    classDeltas: Record<string, number>;
    /** Does B beat A by ≥0.2 on composite across ≥3 task classes? */
    categoryShift: boolean;
    /** Proof chain envelopes */
    proofChain: ProofEnvelope[];
    /** Formatted human-readable report */
    report: string;
}
/**
 * Run an A/B benchmark comparing agent performance with and without
 * the Guidance Control Plane.
 *
 * **Config A** (baseline): No guidance — executor runs without setContext()
 * **Config B** (treatment): With guidance — executor gets setContext(claudeMd) +
 * gate simulation on every output
 *
 * The 20 tasks span 7 task classes drawn from real Claude Flow repo history:
 * bug-fix (3), feature (5), refactor (3), security (3), deployment (2),
 * test (2), performance (2).
 *
 * KPIs tracked per task:
 * - success rate, tool calls, token spend, violations, human interventions
 *
 * Composite score: `success_rate - 0.1*norm_cost - 0.2*violations - 0.1*interventions`
 *
 * **Success criterion**: B beats A by ≥0.2 on composite across ≥3 task classes
 * = "category shift"
 *
 * @param claudeMdContent - The CLAUDE.md content used for Config B
 * @param options - Executor, tasks, proof key, work directory
 * @returns ABReport with full per-task and per-class breakdown
 */
export declare function abBenchmark(claudeMdContent: string, options?: {
    executor?: IHeadlessExecutor;
    tasks?: ABTask[];
    proofKey?: string;
    workDir?: string;
}): Promise<ABReport>;
/**
 * Get the default 20 A/B benchmark tasks.
 * Exported for test customization and documentation.
 */
export declare function getDefaultABTasks(): ABTask[];
export {};
//# sourceMappingURL=analyzer.d.ts.map