mirror of
https://git.quad4.io/RNS-Things/MeshChatX.git
synced 2026-04-24 12:55:53 +00:00
1054 lines
33 KiB
Go
1054 lines
33 KiB
Go
// SPDX-License-Identifier: 0BSD
|
|
//
|
|
// license_scope_mapper analyzes file similarity against an upstream repository
|
|
// and emits per-file SPDX license recommendations with optional header updates.
|
|
//
|
|
// Quick start:
|
|
//
|
|
// go run scripts/license_scope_mapper.go \
|
|
// --repo-root "/run/media/user1/projects/reticulum-meshchatX" \
|
|
// --overrides "scripts/license_scope_overrides.json" \
|
|
// --write-headers --replace-existing
|
|
//
|
|
// Environment variables are supported with the LICENSE_SCOPE_ prefix
|
|
// (for example LICENSE_SCOPE_SCAN_EXTS and LICENSE_SCOPE_HEADER_EXTS).
|
|
//
|
|
// Important caveats (read before relying on output):
|
|
//
|
|
// 1. Textual similarity is not a legal test for derivative-work status. A file
|
|
// forked from upstream and heavily refactored or renamed is still a
|
|
// derivative work and still carries upstream license obligations. The
|
|
// pure-0BSD branch is therefore opt-in (--allow-pure-0bsd) and gated
|
|
// behind --pure-0bsd-confirm-no-derivation to avoid accidental
|
|
// relicensing of upstream-derived material.
|
|
//
|
|
// 2. --clean-non-target-spdx and --replace-existing rewrite or delete
|
|
// SPDX markers in tracked files. Run with --report-json first and
|
|
// review the diff before enabling either flag; third-party SPDX lines
|
|
// in vendored or generated files must not be silently stripped.
|
|
//
|
|
// 3. The default base license is "0BSD AND MIT", which means BOTH licenses
|
|
// apply simultaneously to their respective contributions. This is the
|
|
// safe default for files where provenance cannot be cleanly separated.
|
|
package main
|
|
|
|
import (
	"bufio"
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"hash/fnv"
	"io/fs"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"time"
)
|
|
|
|
const (
	// envPrefix namespaces every environment-variable override (for
	// example LICENSE_SCOPE_SCAN_EXTS); see envKey.
	envPrefix = "LICENSE_SCOPE_"
	// defaultUpstreamURL is the repository compared against when
	// --upstream-url is not supplied.
	defaultUpstreamURL = "https://github.com/liamcottle/reticulum-meshchat"
)
|
|
|
|
// spdxPattern extracts the license expression from an existing
// "SPDX-License-Identifier:" line; capture group 1 is the expression.
var spdxPattern = regexp.MustCompile(`SPDX-License-Identifier:\s*([A-Za-z0-9.\-+ ()]+)`)
|
|
|
|
// config carries every runtime option. The *Raw string fields hold
// comma-separated flag values exactly as given; the map/slice fields at the
// bottom are their parsed forms, filled in by parseConfig.
type config struct {
	repoRoot string // repository being classified
	upstreamPath string // optional local upstream checkout; empty => clone
	upstreamURL string // git URL cloned when upstreamPath is empty
	upstreamBranch string // branch used for the shallow clone
	cloneTimeout time.Duration // hard limit on the clone subprocess
	scanExtsRaw string // extensions analyzed for similarity (raw CSV)
	headerExtsRaw string // extensions eligible for SPDX headers (raw CSV)
	excludeDirsRaw string // directory names pruned during walks (raw CSV)
	excludePathsRaw string // path substrings skipped during walks (raw CSV)
	maxFileSizeBytes int64 // files larger than this are ignored
	mitThreshold float64 // composite similarity fraction (0..1) for plain MIT
	mixedThreshold float64 // composite similarity fraction (0..1) for mixed license
	allowPure0BSD bool // opt-in gate for the pure-0BSD branch in classify
	pure0BSDConfirmNoDerivation bool // second explicit gate required for pure 0BSD
	pure0BSDThreshold float64 // max similarity PERCENT (0..100) for pure 0BSD
	baseLicense string // fallback SPDX expression below mixedThreshold
	maxNameCandidates int // cap on same-basename upstream candidates
	writeHeaders bool // rewrite SPDX headers on disk
	replaceExisting bool // overwrite an existing SPDX header when writing
	cleanNonTarget bool // strip SPDX lines from files outside headerExts
	overridesPath string // JSON map of relpath -> forced SPDX expression
	reportJSON string // report output path, joined onto repoRoot
	explainPathsRaw string // paths to print score explanations for (raw CSV)
	explainAll bool // explain every analyzed file

	// Parsed/derived forms of the raw values above (set by parseConfig).
	scanExts map[string]struct{}
	headerExts map[string]struct{}
	excludedDirs map[string]struct{}
	excludedContains []string
	explainPaths map[string]struct{}
}
|
|
|
|
// fileData is the precomputed similarity fingerprint of one source file:
// a hash of its normalized content plus the line/bigram/shingle sets that
// similarityDetail compares.
type fileData struct {
	absPath string // absolute path on disk
	relPath string // slash-separated path relative to the scan root
	baseName string // filepath.Base, used for cross-path candidate lookup
	normHash string // sha256 hex of the normalized, joined lines
	lineSet map[string]struct{} // unique normalized lines
	bigramSet map[uint64]struct{} // hashed 2-line shingles
	shingle5Set map[uint64]struct{} // hashed 5-line shingles
	shingle7Set map[uint64]struct{} // hashed 7-line shingles
}
|
|
|
|
// matchResult is the outcome of comparing one local file against its best
// upstream counterpart: the weighted composite score, each individual metric
// score, and the raw set counts behind each metric (surfaced by --explain).
type matchResult struct {
	upstreamRelPath string // best-matching upstream path ("" when none found)
	directPathMatch bool // matched at the identical relative path
	similarityPct float64 // weighted composite of the four metrics (0..100)
	lineJaccardPct float64 // Jaccard over unique normalized lines
	bigramDicePct float64 // Dice over 2-line shingles
	shingle5Pct float64 // Jaccard over 5-line shingles
	shingle7Pct float64 // Jaccard over 7-line shingles
	lineCommon int
	lineUnion int
	bigramCommon int
	bigramTotal int // Dice denominator: len(a)+len(b)
	shingle5Common int
	shingle5Union int
	shingle7Common int
	shingle7Union int
}
|
|
|
|
// reportItem is one per-file row of the JSON report. OriginalPct mirrors the
// composite similarity; MinePct is its complement (100 - similarity).
type reportItem struct {
	Path string `json:"path"`
	License string `json:"license"`
	SimilarityPct float64 `json:"similarity_pct"`
	LineJaccardPct float64 `json:"line_jaccard_pct"`
	BigramDicePct float64 `json:"bigram_dice_pct"`
	Shingle5Pct float64 `json:"shingle5_jaccard_pct"`
	Shingle7Pct float64 `json:"shingle7_jaccard_pct"`
	OriginalPct float64 `json:"original_pct"`
	MinePct float64 `json:"mine_pct"`
	UpstreamMatch string `json:"upstream_match"`
	DirectPathMatch bool `json:"direct_path_match"`
	HeaderChanged bool `json:"header_changed"`
	ExistingSPDX string `json:"existing_spdx,omitempty"`
}
|
|
|
|
// reportConfig is the effective configuration echoed into the JSON report so
// a report is reproducible without the original command line.
type reportConfig struct {
	RepoRoot string `json:"repo_root"`
	UpstreamURL string `json:"upstream_url"`
	UpstreamBranch string `json:"upstream_branch"`
	ScanExtensions string `json:"scan_extensions"`
	HeaderExtensions string `json:"header_extensions"`
	MitThreshold float64 `json:"mit_threshold"`
	MixedThreshold float64 `json:"mixed_threshold"`
	AllowPure0BSD bool `json:"allow_pure_0bsd"`
	Pure0BSDConfirmNoDerivation bool `json:"pure_0bsd_confirm_no_derivation"`
	Pure0BSDThreshold float64 `json:"pure_0bsd_threshold_pct"`
	BaseLicense string `json:"base_license"`
	MaxNameCandidates int `json:"max_name_candidates"`
	WriteHeaders bool `json:"write_headers"`
	ReplaceExisting bool `json:"replace_existing"`
	CleanNonTarget bool `json:"clean_non_target_spdx"`
	OverridesPath string `json:"overrides_path,omitempty"`
}
|
|
|
|
// reportSummary aggregates run-level counters for the JSON report.
type reportSummary struct {
	FilesAnalyzed int `json:"files_analyzed"`
	HeadersUpdated int `json:"headers_updated"`
	RemovedNonTargetSPDX int `json:"removed_non_target_spdx"`
	LicenseCounts map[string]int `json:"license_counts"`
}
|
|
|
|
// reportDocument is the top-level JSON report: timestamp, the effective
// configuration, summary counters, and the per-file results.
type reportDocument struct {
	GeneratedAt string `json:"generated_at"`
	Config reportConfig `json:"config"`
	Summary reportSummary `json:"summary"`
	Results []reportItem `json:"results"`
}
|
|
|
|
// commentStyleByExt maps a lowercased file extension to the comment style
// used for SPDX header lines (see formatSPDX): "hash" => "#", "line" =>
// "//", "block" => "/* */", "html" => "<!-- -->". Extensions missing from
// this map are excluded from analysis entirely (see shouldSkipFile).
var commentStyleByExt = map[string]string{
	".py": "hash",
	".sh": "hash",
	".bash": "hash",
	".zsh": "hash",
	".yml": "hash",
	".yaml": "hash",
	".toml": "hash",
	".ini": "hash",
	".cfg": "hash",
	".conf": "hash",
	".mk": "hash",
	".js": "line",
	".ts": "line",
	".jsx": "line",
	".tsx": "line",
	".c": "line",
	".h": "line",
	".cpp": "line",
	".hpp": "line",
	".java": "line",
	".go": "line",
	".rs": "line",
	".swift": "line",
	".kt": "line",
	".css": "block",
	".scss": "block",
	".less": "block",
	".html": "html",
	".xml": "html",
	".vue": "html",
	".md": "html",
}
|
|
|
|
// main runs the full pipeline: parse flags, warn about dangerous flag
// combinations, optionally strip SPDX markers from non-target files, obtain
// an upstream tree (local checkout or fresh shallow clone), score every
// local file against upstream, classify a license per file (with optional
// overrides), optionally rewrite headers, and write the JSON report.
func main() {
	cfg, err := parseConfig()
	if err != nil {
		exitErr(err)
	}

	// Stderr warnings for flags that can rewrite or strip license markers.
	emitSafetyNotices(cfg)

	if cfg.cleanNonTarget {
		removed, err := removeSPDXFromNonTargetFiles(cfg)
		if err != nil {
			exitErr(err)
		}
		fmt.Printf("Removed SPDX headers from non-target files: %d\n", removed)
	}

	// Prefer a local upstream checkout; otherwise shallow-clone into a temp
	// dir that is removed via cleanup.
	upstreamRoot := cfg.upstreamPath
	cleanup := func() {}
	if strings.TrimSpace(upstreamRoot) == "" {
		tmpDir, err := os.MkdirTemp("", "license-scope-upstream-*")
		if err != nil {
			exitErr(err)
		}
		cleanup = func() { _ = os.RemoveAll(tmpDir) }
		upstreamRoot = filepath.Join(tmpDir, "upstream")
		if err := cloneUpstream(cfg, upstreamRoot); err != nil {
			cleanup()
			exitErr(err)
		}
	}
	// NOTE(review): exitErr calls os.Exit, which skips deferred calls, so the
	// temp clone leaks on later error paths — confirm whether that matters.
	defer cleanup()

	repoRootAbs, _ := filepath.Abs(cfg.repoRoot)
	upstreamAbs, _ := filepath.Abs(upstreamRoot)

	localFiles, err := discoverFiles(cfg, repoRootAbs)
	if err != nil {
		exitErr(err)
	}
	overrides, err := loadOverrides(cfg.overridesPath)
	if err != nil {
		exitErr(err)
	}
	upstreamFiles, err := discoverFiles(cfg, upstreamAbs)
	if err != nil {
		exitErr(err)
	}

	// Index upstream by relative path (direct matches) and by basename
	// (renamed/moved candidates).
	upstreamByRel := make(map[string]fileData, len(upstreamFiles))
	upstreamByBase := make(map[string][]fileData, len(upstreamFiles))
	for _, f := range upstreamFiles {
		upstreamByRel[f.relPath] = f
		upstreamByBase[f.baseName] = append(upstreamByBase[f.baseName], f)
	}

	results := make([]reportItem, 0, len(localFiles))
	classCounts := map[string]int{}
	headersUpdated := 0
	removedNonTarget := 0
	if cfg.cleanNonTarget {
		// Cleanup already ran (and printed its count) above. The JSON
		// summary intentionally keeps zero here instead of rescanning the
		// tree just to recount.
	}

	for _, local := range localFiles {
		match := findBestMatch(local, upstreamByRel, upstreamByBase, cfg.maxNameCandidates)
		license, decisionReason := classify(match, cfg)
		// Explicit operator overrides win over the classifier.
		if forced, ok := overrides[local.relPath]; ok {
			license = forced
			decisionReason = "override file matched path"
		}
		classCounts[license]++

		item := reportItem{
			Path: local.relPath,
			License: license,
			SimilarityPct: round2(match.similarityPct),
			LineJaccardPct: round2(match.lineJaccardPct),
			BigramDicePct: round2(match.bigramDicePct),
			Shingle5Pct: round2(match.shingle5Pct),
			Shingle7Pct: round2(match.shingle7Pct),
			OriginalPct: round2(match.similarityPct),
			MinePct: round2(100.0 - match.similarityPct),
			UpstreamMatch: match.upstreamRelPath,
			DirectPathMatch: match.directPathMatch,
		}

		if cfg.writeHeaders && isHeaderExt(local.relPath, cfg.headerExts) {
			changed, existing, err := upsertSPDX(local.absPath, license, cfg.replaceExisting)
			if err != nil {
				// Header failures are reported but do not abort the run.
				fmt.Fprintf(os.Stderr, "SPDX update failed for %s: %v\n", local.relPath, err)
			} else {
				item.HeaderChanged = changed
				item.ExistingSPDX = existing
				if changed {
					headersUpdated++
				}
			}
		}

		if shouldExplainPath(local.relPath, cfg.explainAll, cfg.explainPaths) {
			printExplanation(local.relPath, match, license, decisionReason, cfg)
		}

		results = append(results, item)
	}

	// Deterministic report ordering regardless of walk order.
	sort.Slice(results, func(i, j int) bool { return results[i].Path < results[j].Path })

	doc := reportDocument{
		GeneratedAt: time.Now().UTC().Format(time.RFC3339),
		Config: reportConfig{
			RepoRoot: repoRootAbs,
			UpstreamURL: cfg.upstreamURL,
			UpstreamBranch: cfg.upstreamBranch,
			ScanExtensions: cfg.scanExtsRaw,
			HeaderExtensions: cfg.headerExtsRaw,
			MitThreshold: cfg.mitThreshold,
			MixedThreshold: cfg.mixedThreshold,
			AllowPure0BSD: cfg.allowPure0BSD,
			Pure0BSDConfirmNoDerivation: cfg.pure0BSDConfirmNoDerivation,
			Pure0BSDThreshold: cfg.pure0BSDThreshold,
			BaseLicense: cfg.baseLicense,
			MaxNameCandidates: cfg.maxNameCandidates,
			WriteHeaders: cfg.writeHeaders,
			ReplaceExisting: cfg.replaceExisting,
			CleanNonTarget: cfg.cleanNonTarget,
			OverridesPath: cfg.overridesPath,
		},
		Summary: reportSummary{
			FilesAnalyzed: len(results),
			HeadersUpdated: headersUpdated,
			RemovedNonTargetSPDX: removedNonTarget,
			LicenseCounts: classCounts,
		},
		Results: results,
	}

	reportPath := filepath.Join(repoRootAbs, cfg.reportJSON)
	reportBytes, err := json.MarshalIndent(doc, "", " ")
	if err != nil {
		exitErr(err)
	}
	if err := os.WriteFile(reportPath, append(reportBytes, '\n'), 0o644); err != nil {
		exitErr(err)
	}

	fmt.Printf("Analyzed files: %d\n", len(results))
	fmt.Printf("License counts -> %s\n", formatCountSummary(classCounts, float64(len(results))))
	fmt.Printf("JSON report: %s\n", reportPath)
	if cfg.writeHeaders {
		fmt.Printf("Headers updated (%s): %d\n", cfg.headerExtsRaw, headersUpdated)
	}
}
|
|
|
|
// parseConfig registers all CLI flags (each defaulting to its LICENSE_SCOPE_
// environment variable when present), parses them, derives the parsed
// set/list fields from the raw comma-separated values, and validates the
// threshold ranges. Note the unit split: mit/mixed thresholds are fractions
// in [0,1] while pure-0bsd-threshold is a percent in [0,100].
func parseConfig() (config, error) {
	cfg := config{}
	flag.StringVar(&cfg.repoRoot, "repo-root", envString("REPO_ROOT", "."), "Path to repository root")
	flag.StringVar(&cfg.upstreamPath, "upstream-path", envString("UPSTREAM_PATH", ""), "Path to local upstream checkout")
	flag.StringVar(&cfg.upstreamURL, "upstream-url", envString("UPSTREAM_URL", defaultUpstreamURL), "Upstream git URL")
	flag.StringVar(&cfg.upstreamBranch, "upstream-branch", envString("UPSTREAM_BRANCH", "master"), "Upstream branch")
	flag.DurationVar(&cfg.cloneTimeout, "clone-timeout", envDuration("CLONE_TIMEOUT", 2*time.Minute), "Timeout for upstream clone")
	flag.StringVar(&cfg.scanExtsRaw, "scan-exts", envString("SCAN_EXTS", ".py,.vue"), "Comma-separated extensions for similarity analysis")
	flag.StringVar(&cfg.headerExtsRaw, "header-exts", envString("HEADER_EXTS", ".py,.vue"), "Comma-separated extensions eligible for SPDX headers")
	flag.StringVar(&cfg.excludeDirsRaw, "exclude-dirs", envString("EXCLUDE_DIRS", ".git,.idea,.vscode,.local,.pnpm-store,.flatpak-builder,node_modules,dist,build,.venv,venv,__pycache__,.pytest_cache,.mypy_cache,.ruff_cache"), "Comma-separated directories to skip")
	flag.StringVar(&cfg.excludePathsRaw, "exclude-path-contains", envString("EXCLUDE_PATH_CONTAINS", ""), "Comma-separated path substrings to skip")
	flag.Int64Var(&cfg.maxFileSizeBytes, "max-file-size-bytes", envInt64("MAX_FILE_SIZE_BYTES", 2_000_000), "Max file size for analysis")
	flag.Float64Var(&cfg.mitThreshold, "mit-threshold", envFloat("MIT_THRESHOLD", 0.85), "Composite similarity threshold for MIT")
	flag.Float64Var(&cfg.mixedThreshold, "mixed-threshold", envFloat("MIXED_THRESHOLD", 0.25), "Composite similarity threshold for mixed license")
	flag.BoolVar(&cfg.allowPure0BSD, "allow-pure-0bsd", envBool("ALLOW_PURE_0BSD", false), "Allow pure 0BSD classification for files with very low similarity and no direct path match. Disabled by default because textual similarity is not a legal test for derivative-work status.")
	flag.BoolVar(&cfg.pure0BSDConfirmNoDerivation, "pure-0bsd-confirm-no-derivation", envBool("PURE_0BSD_CONFIRM_NO_DERIVATION", false), "Required acknowledgement that files classified as pure 0BSD are not derivative works of upstream sources. Without this flag --allow-pure-0bsd is ignored.")
	flag.Float64Var(&cfg.pure0BSDThreshold, "pure-0bsd-threshold", envFloat("PURE_0BSD_THRESHOLD", 1.0), "Max similarity percent for pure 0BSD")
	flag.StringVar(&cfg.baseLicense, "base-license", envString("BASE_LICENSE", "0BSD AND MIT"), "Default SPDX license below mixed threshold")
	flag.IntVar(&cfg.maxNameCandidates, "max-name-candidates", envInt("MAX_NAME_CANDIDATES", 200), "Max basename candidates when direct path match missing")
	flag.BoolVar(&cfg.writeHeaders, "write-headers", envBool("WRITE_HEADERS", false), "Write SPDX headers")
	flag.BoolVar(&cfg.replaceExisting, "replace-existing", envBool("REPLACE_EXISTING", false), "Replace existing SPDX header")
	flag.BoolVar(&cfg.cleanNonTarget, "clean-non-target-spdx", envBool("CLEAN_NON_TARGET_SPDX", false), "Remove SPDX headers from files outside header-exts")
	flag.StringVar(&cfg.overridesPath, "overrides", envString("OVERRIDES", ""), "Path to JSON map of relpath->SPDX")
	flag.StringVar(&cfg.reportJSON, "report-json", envString("REPORT_JSON", "license-scope-report.json"), "JSON report output path")
	flag.StringVar(&cfg.explainPathsRaw, "explain-paths", envString("EXPLAIN_PATHS", ""), "Comma-separated repo-relative paths to explain scoring decisions")
	flag.BoolVar(&cfg.explainAll, "explain-all", envBool("EXPLAIN_ALL", false), "Print scoring explanation for every analyzed file")
	flag.Parse()

	// Derive the parsed forms of the comma-separated raw values.
	cfg.scanExts = parseExtSet(cfg.scanExtsRaw)
	cfg.headerExts = parseExtSet(cfg.headerExtsRaw)
	cfg.excludedDirs = parseStringSet(cfg.excludeDirsRaw)
	cfg.excludedContains = parseStringList(cfg.excludePathsRaw)
	cfg.explainPaths = parsePathSet(cfg.explainPathsRaw)

	if len(cfg.scanExts) == 0 {
		return cfg, errors.New("scan-exts must include at least one extension")
	}
	if len(cfg.headerExts) == 0 {
		return cfg, errors.New("header-exts must include at least one extension")
	}
	if cfg.mixedThreshold < 0 || cfg.mitThreshold < 0 || cfg.mitThreshold > 1 || cfg.mixedThreshold > cfg.mitThreshold {
		return cfg, errors.New("thresholds must satisfy 0 <= mixed-threshold <= mit-threshold <= 1")
	}
	if cfg.pure0BSDThreshold < 0 || cfg.pure0BSDThreshold > 100 {
		return cfg, errors.New("pure-0bsd-threshold must be in [0,100] percent")
	}
	return cfg, nil
}
|
|
|
|
// envKey qualifies name with the tool's environment-variable namespace.
func envKey(name string) string {
	return envPrefix + name
}
|
|
|
|
func envString(name, fallback string) string {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
return v
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envBool(name string, fallback bool) bool {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
b, err := strconv.ParseBool(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return b
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envInt(name string, fallback int) int {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
i, err := strconv.Atoi(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return i
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envInt64(name string, fallback int64) int64 {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
i, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
|
|
if err == nil {
|
|
return i
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envFloat(name string, fallback float64) float64 {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
|
|
if err == nil {
|
|
return f
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envDuration(name string, fallback time.Duration) time.Duration {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
d, err := time.ParseDuration(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return d
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
// parseStringList splits raw on commas, trims each token, and drops empty
// tokens. The result is always non-nil (possibly empty).
func parseStringList(raw string) []string {
	tokens := strings.Split(raw, ",")
	out := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if trimmed := strings.TrimSpace(tok); trimmed != "" {
			out = append(out, trimmed)
		}
	}
	return out
}
|
|
|
|
func parseStringSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, t := range parseStringList(raw) {
|
|
out[t] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func parsePathSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, p := range parseStringList(raw) {
|
|
out[filepath.ToSlash(strings.TrimSpace(p))] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func parseExtSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, token := range parseStringList(raw) {
|
|
ext := strings.ToLower(token)
|
|
if !strings.HasPrefix(ext, ".") {
|
|
ext = "." + ext
|
|
}
|
|
out[ext] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func cloneUpstream(cfg config, target string) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), cfg.cloneTimeout)
|
|
defer cancel()
|
|
cmd := exec.CommandContext(ctx, "git", "clone", "--depth", "1", "--branch", cfg.upstreamBranch, cfg.upstreamURL, target)
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
if err := cmd.Run(); err != nil {
|
|
if ctx.Err() == context.DeadlineExceeded {
|
|
return fmt.Errorf("git clone timed out after %s", cfg.cloneTimeout)
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// discoverFiles walks root and fingerprints every file that survives the
// exclusion rules (shouldSkipFile). Files that fail to load or normalize
// (unreadable, binary, scanner-overflow) are skipped silently rather than
// aborting the walk; directory-level walk errors are fatal.
func discoverFiles(cfg config, root string) ([]fileData, error) {
	out := []fileData{}
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		name := d.Name()
		if d.IsDir() {
			// Prune excluded directories wholesale.
			if _, skip := cfg.excludedDirs[name]; skip {
				return filepath.SkipDir
			}
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		// Relative paths are slash-normalized so local and upstream trees
		// index identically on every platform.
		rel = filepath.ToSlash(rel)
		if shouldSkipFile(cfg, rel, path) {
			return nil
		}
		normHash, lineSet, bigrams, s5, s7, err := loadAndNormalize(path)
		if err != nil {
			// Deliberate: unreadable/binary files are skipped, not fatal.
			return nil
		}
		out = append(out, fileData{
			absPath: path,
			relPath: rel,
			baseName: filepath.Base(path),
			normHash: normHash,
			lineSet: lineSet,
			bigramSet: bigrams,
			shingle5Set: s5,
			shingle7Set: s7,
		})
		return nil
	})
	return out, err
}
|
|
|
|
func shouldSkipFile(cfg config, rel, abs string) bool {
|
|
for _, part := range cfg.excludedContains {
|
|
if strings.Contains(rel, part) {
|
|
return true
|
|
}
|
|
}
|
|
ext := strings.ToLower(filepath.Ext(abs))
|
|
if _, ok := cfg.scanExts[ext]; !ok {
|
|
return true
|
|
}
|
|
if _, ok := commentStyleByExt[ext]; !ok {
|
|
return true
|
|
}
|
|
info, err := os.Stat(abs)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
if info.Size() > cfg.maxFileSizeBytes {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func loadAndNormalize(path string) (string, map[string]struct{}, map[uint64]struct{}, map[uint64]struct{}, map[uint64]struct{}, error) {
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return "", nil, nil, nil, nil, err
|
|
}
|
|
if bytes.IndexByte(raw, 0) >= 0 {
|
|
return "", nil, nil, nil, nil, errors.New("binary file")
|
|
}
|
|
sc := bufio.NewScanner(bytes.NewReader(raw))
|
|
sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
|
|
|
|
lines := []string{}
|
|
lineSet := map[string]struct{}{}
|
|
for sc.Scan() {
|
|
line := sc.Text()
|
|
if strings.Contains(line, "SPDX-License-Identifier:") {
|
|
continue
|
|
}
|
|
norm := strings.ToLower(strings.TrimSpace(line))
|
|
if norm == "" {
|
|
continue
|
|
}
|
|
lines = append(lines, norm)
|
|
lineSet[norm] = struct{}{}
|
|
}
|
|
if err := sc.Err(); err != nil {
|
|
return "", nil, nil, nil, nil, err
|
|
}
|
|
sum := sha256.Sum256([]byte(strings.Join(lines, "\n")))
|
|
return hex.EncodeToString(sum[:]), lineSet, makeNGramSet(lines, 2), makeNGramSet(lines, 5), makeNGramSet(lines, 7), nil
|
|
}
|
|
|
|
// findBestMatch returns the upstream file most similar to local. The file at
// the identical relative path (if any) is scored first as the baseline; a
// file that merely shares the basename (capped at maxCandidates) displaces
// it only when it scores strictly higher on the composite metric.
func findBestMatch(local fileData, byRel map[string]fileData, byBase map[string][]fileData, maxCandidates int) matchResult {
	best := matchResult{}
	if direct, ok := byRel[local.relPath]; ok {
		best = similarityDetail(local, direct)
		best.upstreamRelPath = direct.relPath
		best.directPathMatch = true
	}

	// Basename candidates catch moved/renamed files. The slice comes from
	// upstream walk order; truncation caps work for very common basenames.
	candidates := byBase[local.baseName]
	if len(candidates) > maxCandidates {
		candidates = candidates[:maxCandidates]
	}
	for _, cand := range candidates {
		// Skip the entry already scored as the current best.
		if cand.relPath == best.upstreamRelPath {
			continue
		}
		d := similarityDetail(local, cand)
		if d.similarityPct > best.similarityPct {
			d.upstreamRelPath = cand.relPath
			d.directPathMatch = cand.relPath == local.relPath
			best = d
		}
	}
	// Defensive re-assertion: if the winner sits at local's own path, the
	// direct-path flag must be set regardless of which branch produced it.
	if best.upstreamRelPath == local.relPath && best.upstreamRelPath != "" {
		best.directPathMatch = true
	}
	return best
}
|
|
|
|
// similarityDetail computes the full set of similarity metrics between two
// fingerprints. Identical non-empty normalized hashes short-circuit to a
// perfect 100% result; the counts in that fast path mirror what the full
// computation would report for identical sets (note bigramTotal uses the
// Dice denominator len(a)+len(b), not a union).
func similarityDetail(a, b fileData) matchResult {
	if a.normHash != "" && a.normHash == b.normHash {
		return matchResult{
			similarityPct: 100,
			lineJaccardPct: 100,
			bigramDicePct: 100,
			shingle5Pct: 100,
			shingle7Pct: 100,
			lineCommon: len(a.lineSet),
			lineUnion: len(a.lineSet),
			bigramCommon: len(a.bigramSet),
			bigramTotal: len(a.bigramSet) + len(b.bigramSet),
			shingle5Common: len(a.shingle5Set),
			shingle5Union: len(a.shingle5Set),
			shingle7Common: len(a.shingle7Set),
			shingle7Union: len(a.shingle7Set),
		}
	}
	lineJ, lineCommon, lineUnion := setJaccardPctWithCounts(a.lineSet, b.lineSet)
	biD, biCommon, biTotal := setDicePctWithCounts(a.bigramSet, b.bigramSet)
	sh5, sh5Common, sh5Union := setJaccardPctWithCounts(a.shingle5Set, b.shingle5Set)
	sh7, sh7Common, sh7Union := setJaccardPctWithCounts(a.shingle7Set, b.shingle7Set)
	// A metric only contributes to the composite when BOTH files have data
	// for it (short files may lack 5/7-line shingles entirely).
	composite := combineComposite(lineJ, biD, sh5, sh7, len(a.bigramSet) > 0 && len(b.bigramSet) > 0, len(a.shingle5Set) > 0 && len(b.shingle5Set) > 0, len(a.shingle7Set) > 0 && len(b.shingle7Set) > 0)
	return matchResult{
		similarityPct: composite,
		lineJaccardPct: lineJ,
		bigramDicePct: biD,
		shingle5Pct: sh5,
		shingle7Pct: sh7,
		lineCommon: lineCommon,
		lineUnion: lineUnion,
		bigramCommon: biCommon,
		bigramTotal: biTotal,
		shingle5Common: sh5Common,
		shingle5Union: sh5Union,
		shingle7Common: sh7Common,
		shingle7Union: sh7Union,
	}
}
|
|
|
|
// classify maps a similarity result to an SPDX expression. The pure-0BSD
|
|
// branch requires both --allow-pure-0bsd and --pure-0bsd-confirm-no-derivation
|
|
// because textual similarity alone cannot prove non-derivation; renamed or
|
|
// heavily refactored forks of upstream files retain MIT obligations.
|
|
func classify(match matchResult, cfg config) (string, string) {
|
|
sim := match.similarityPct / 100.0
|
|
if sim >= cfg.mitThreshold {
|
|
return "MIT", fmt.Sprintf("composite similarity %.2f%% >= MIT threshold %.2f%%", match.similarityPct, cfg.mitThreshold*100.0)
|
|
}
|
|
if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation && !match.directPathMatch && match.similarityPct <= cfg.pure0BSDThreshold {
|
|
return "0BSD", fmt.Sprintf("low similarity %.2f%% <= pure-0bsd threshold %.2f%% and no direct path match (operator confirmed non-derivation)", match.similarityPct, cfg.pure0BSDThreshold)
|
|
}
|
|
if sim >= cfg.mixedThreshold {
|
|
return "0BSD AND MIT", fmt.Sprintf("composite similarity %.2f%% >= mixed threshold %.2f%%", match.similarityPct, cfg.mixedThreshold*100.0)
|
|
}
|
|
return cfg.baseLicense, fmt.Sprintf("below mixed threshold; fallback to base license %s", cfg.baseLicense)
|
|
}
|
|
|
|
func makeNGramSet(lines []string, n int) map[uint64]struct{} {
|
|
out := map[uint64]struct{}{}
|
|
if n <= 0 || len(lines) < n {
|
|
return out
|
|
}
|
|
for i := 0; i <= len(lines)-n; i++ {
|
|
out[hashNGram(lines[i:i+n])] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// hashNGram folds parts into a single FNV-1a 64-bit value, joining adjacent
// parts with a NUL separator so that ["ab"] and ["a","b"] hash differently.
func hashNGram(parts []string) uint64 {
	h := fnv.New64a()
	last := len(parts) - 1
	for i, part := range parts {
		_, _ = h.Write([]byte(part))
		if i != last {
			_, _ = h.Write([]byte{0})
		}
	}
	return h.Sum64()
}
|
|
|
|
func combineComposite(lineJ, biD, sh5, sh7 float64, hasBi, hasS5, hasS7 bool) float64 {
|
|
type part struct {
|
|
score float64
|
|
weight float64
|
|
ok bool
|
|
}
|
|
parts := []part{
|
|
{lineJ, 0.20, true},
|
|
{biD, 0.30, hasBi},
|
|
{sh5, 0.30, hasS5},
|
|
{sh7, 0.20, hasS7},
|
|
}
|
|
weighted := 0.0
|
|
total := 0.0
|
|
for _, p := range parts {
|
|
if !p.ok {
|
|
continue
|
|
}
|
|
weighted += p.score * p.weight
|
|
total += p.weight
|
|
}
|
|
if total <= 0 {
|
|
return 0
|
|
}
|
|
return weighted / total
|
|
}
|
|
|
|
// setJaccardPctWithCounts returns the Jaccard similarity of a and b as a
// percentage, along with the intersection and union sizes. When either set
// is empty the score is 0 and the "union" reported is len(a)+len(b).
func setJaccardPctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) {
	if len(a) == 0 || len(b) == 0 {
		return 0, 0, len(a) + len(b)
	}
	// Iterate the smaller set; intersection is symmetric.
	small, large := a, b
	if len(b) < len(a) {
		small, large = b, a
	}
	inter := 0
	for k := range small {
		if _, hit := large[k]; hit {
			inter++
		}
	}
	union := len(a) + len(b) - inter
	if union <= 0 {
		return 0, inter, union
	}
	return float64(inter) / float64(union) * 100.0, inter, union
}
|
|
|
|
// setDicePctWithCounts returns the Dice coefficient of a and b as a
// percentage, along with the intersection size and the Dice denominator
// len(a)+len(b). Either set being empty scores 0.
func setDicePctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) {
	total := len(a) + len(b)
	if len(a) == 0 || len(b) == 0 {
		return 0, 0, total
	}
	shared := 0
	for k := range a {
		if _, hit := b[k]; hit {
			shared++
		}
	}
	if total <= 0 {
		// Unreachable when both sets are non-empty; kept as a guard.
		return 0, shared, total
	}
	return 2.0 * float64(shared) / float64(total) * 100.0, shared, total
}
|
|
|
|
// loadOverrides reads a JSON object mapping repo-relative path -> SPDX
// expression. A blank path yields an empty map. Keys are slash-normalized
// and trimmed; entries whose key or value trims to empty are dropped.
func loadOverrides(path string) (map[string]string, error) {
	overrides := map[string]string{}
	if strings.TrimSpace(path) == "" {
		return overrides, nil
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	parsed := map[string]string{}
	if err := json.Unmarshal(data, &parsed); err != nil {
		return nil, err
	}
	for rawPath, rawLicense := range parsed {
		rel := filepath.ToSlash(strings.TrimSpace(rawPath))
		lic := strings.TrimSpace(rawLicense)
		if rel == "" || lic == "" {
			continue
		}
		overrides[rel] = lic
	}
	return overrides, nil
}
|
|
|
|
// isHeaderExt reports whether path's extension (lowercased) is in exts.
func isHeaderExt(path string, exts map[string]struct{}) bool {
	_, ok := exts[strings.ToLower(filepath.Ext(path))]
	return ok
}
|
|
|
|
// removeSPDXFromNonTargetFiles walks the repo and strips the SPDX line from
// every file that passes the scan filters but is NOT in header-exts (i.e.
// files this tool does not manage headers for). Returns the number of files
// rewritten. Caution: this is the destructive half of --clean-non-target-spdx.
func removeSPDXFromNonTargetFiles(cfg config) (int, error) {
	rootAbs, _ := filepath.Abs(cfg.repoRoot)
	removed := 0
	err := filepath.WalkDir(rootAbs, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if d.IsDir() {
			// Prune excluded directories wholesale.
			if _, skip := cfg.excludedDirs[d.Name()]; skip {
				return filepath.SkipDir
			}
			return nil
		}
		rel, err := filepath.Rel(rootAbs, path)
		if err != nil {
			return err
		}
		rel = filepath.ToSlash(rel)
		if shouldSkipFile(cfg, rel, path) {
			return nil
		}
		// Header-managed files are handled by upsertSPDX, never stripped.
		if isHeaderExt(path, cfg.headerExts) {
			return nil
		}
		changed, err := removeSPDXLine(path)
		if err != nil {
			return err
		}
		if changed {
			removed++
		}
		return nil
	})
	return removed, err
}
|
|
|
|
// removeSPDXLine deletes the first line containing an SPDX marker within the
// first 25 lines of the file at path, absorbing one immediately following
// blank line so no empty gap is left behind. Returns whether the file was
// rewritten.
func removeSPDXLine(path string) (bool, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return false, err
	}
	text := string(raw)
	lines := strings.Split(text, "\n")
	// Only the top of the file is scanned; SPDX markers live in headers.
	maxScan := 25
	if len(lines) < maxScan {
		maxScan = len(lines)
	}
	idx := -1
	for i := 0; i < maxScan; i++ {
		if strings.Contains(lines[i], "SPDX-License-Identifier:") {
			idx = i
			break
		}
	}
	if idx < 0 {
		return false, nil
	}
	out := append([]string{}, lines[:idx]...)
	// Drop the blank separator line that typically follows the header.
	if idx+1 < len(lines) && strings.TrimSpace(lines[idx+1]) == "" {
		out = append(out, lines[idx+2:]...)
	} else {
		out = append(out, lines[idx+1:]...)
	}
	newText := strings.Join(out, "\n")
	if newText == text {
		return false, nil
	}
	return true, os.WriteFile(path, []byte(newText), 0o644)
}
|
|
|
|
// upsertSPDX inserts or (when replaceExisting) replaces the SPDX header line
// in the file at path, using the comment style registered for its extension.
// Only the first 25 lines are scanned for an existing marker. Returns
// whether the file changed, plus any SPDX expression that was already
// present (empty when none).
func upsertSPDX(path, spdx string, replaceExisting bool) (bool, string, error) {
	ext := strings.ToLower(filepath.Ext(path))
	style, ok := commentStyleByExt[ext]
	if !ok {
		// No known comment style: leave the file untouched.
		return false, "", nil
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		return false, "", err
	}
	text := string(raw)
	lines := strings.Split(text, "\n")
	existingIdx := -1
	existingValue := ""
	maxScan := 25
	if len(lines) < maxScan {
		maxScan = len(lines)
	}
	for i := 0; i < maxScan; i++ {
		if strings.Contains(lines[i], "SPDX-License-Identifier:") {
			existingIdx = i
			// Capture the current expression for the report, even when we
			// are not allowed to replace it.
			m := spdxPattern.FindStringSubmatch(lines[i])
			if len(m) > 1 {
				existingValue = strings.TrimSpace(m[1])
			}
			break
		}
	}
	spdxLine := formatSPDX(style, spdx)
	if existingIdx >= 0 {
		if !replaceExisting {
			// Existing headers are preserved unless --replace-existing.
			return false, existingValue, nil
		}
		lines[existingIdx] = spdxLine
	} else {
		// Insert after a shebang and, for Python, after a "# ... coding ..."
		// declaration line, followed by one blank separator line.
		insertAt := 0
		if len(lines) > 0 && strings.HasPrefix(lines[0], "#!") {
			insertAt = 1
		}
		if ext == ".py" && len(lines) > insertAt && strings.HasPrefix(lines[insertAt], "#") && strings.Contains(lines[insertAt], "coding") {
			insertAt++
		}
		prefix := append([]string{}, lines[:insertAt]...)
		suffix := append([]string{}, lines[insertAt:]...)
		lines = append(prefix, spdxLine, "")
		lines = append(lines, suffix...)
	}
	newText := strings.Join(lines, "\n")
	if newText == text {
		return false, existingValue, nil
	}
	return true, existingValue, os.WriteFile(path, []byte(newText), 0o644)
}
|
|
|
|
// formatSPDX renders an SPDX header line for the given comment style
// ("hash", "line", "block", or "html"); unknown styles fall back to the
// C++-style line comment.
func formatSPDX(style, spdx string) string {
	marker := "SPDX-License-Identifier: " + spdx
	switch style {
	case "hash":
		return "# " + marker
	case "block":
		return "/* " + marker + " */"
	case "html":
		return "<!-- " + marker + " -->"
	case "line":
		return "// " + marker
	default:
		return "// " + marker
	}
}
|
|
|
|
func formatCountSummary(counts map[string]int, total float64) string {
|
|
keys := make([]string, 0, len(counts))
|
|
for k := range counts {
|
|
keys = append(keys, k)
|
|
}
|
|
sort.Strings(keys)
|
|
parts := make([]string, 0, len(keys))
|
|
for _, k := range keys {
|
|
parts = append(parts, fmt.Sprintf("%s: %d (%.2f%%)", k, counts[k], pct(counts[k], total)))
|
|
}
|
|
return strings.Join(parts, ", ")
|
|
}
|
|
|
|
// shouldExplainPath reports whether a score explanation should be printed
// for relPath: always under --explain-all, otherwise only when the
// slash-normalized path was listed via --explain-paths.
func shouldExplainPath(relPath string, explainAll bool, explainPaths map[string]struct{}) bool {
	if explainAll {
		return true
	}
	_, requested := explainPaths[filepath.ToSlash(relPath)]
	return requested
}
|
|
|
|
// printExplanation writes a human-readable breakdown of one file's scoring
// to stdout: the matched upstream file, each metric with its raw counts, the
// active thresholds, and the final license decision with its reason.
func printExplanation(relPath string, match matchResult, license, decisionReason string, cfg config) {
	fmt.Printf("\n--- score explanation: %s ---\n", relPath)
	if match.upstreamRelPath == "" {
		fmt.Printf("upstream match: <none>\n")
	} else {
		fmt.Printf("upstream match: %s\n", match.upstreamRelPath)
	}
	fmt.Printf("direct path match: %v\n", match.directPathMatch)
	fmt.Printf("composite similarity: %.2f%%\n", match.similarityPct)
	fmt.Printf("line_jaccard: %.2f%% (%d common / %d union)\n", match.lineJaccardPct, match.lineCommon, match.lineUnion)
	fmt.Printf("bigram_dice: %.2f%% (%d common / %d total)\n", match.bigramDicePct, match.bigramCommon, match.bigramTotal)
	fmt.Printf("shingle5_jaccard: %.2f%% (%d common / %d union)\n", match.shingle5Pct, match.shingle5Common, match.shingle5Union)
	fmt.Printf("shingle7_jaccard: %.2f%% (%d common / %d union)\n", match.shingle7Pct, match.shingle7Common, match.shingle7Union)
	// mit/mixed are stored as fractions, so scale to percent for display;
	// pure0BSDThreshold is already a percent.
	fmt.Printf(
		"thresholds: mit=%.2f%% mixed=%.2f%% pure_0bsd=%.2f%% allow_pure_0bsd=%v\n",
		cfg.mitThreshold*100.0,
		cfg.mixedThreshold*100.0,
		cfg.pure0BSDThreshold,
		cfg.allowPure0BSD,
	)
	fmt.Printf("decision: %s (%s)\n", license, decisionReason)
	fmt.Printf("--- end explanation ---\n")
}
|
|
|
|
// round2 rounds v to two decimal places, rounding halves away from zero.
//
// The previous implementation, float64(int(v*100+0.5))/100, mis-rounded
// negative inputs (adding 0.5 pushes them toward positive infinity, e.g.
// -1.236 became -1.23 instead of -1.24) and silently overflowed the int
// conversion for very large magnitudes. math.Round handles both; results
// for the non-negative percentages this tool produces are unchanged.
func round2(v float64) float64 {
	return math.Round(v*100) / 100
}
|
|
|
|
// pct converts count into a percentage of total; a non-positive total
// yields 0 rather than dividing by zero.
func pct(count int, total float64) float64 {
	if total > 0 {
		return float64(count) / total * 100.0
	}
	return 0
}
|
|
|
|
// emitSafetyNotices prints stderr warnings for flag combinations that can
// silently relicense files or strip third-party license markers. The
// pure-0BSD branch in classify also requires the explicit confirm flag, so
// this function reminds the operator when --allow-pure-0bsd is set without
// it. Warnings are informational only; no flag combination is rejected here.
func emitSafetyNotices(cfg config) {
	// allow-pure-0bsd without its confirm flag is a no-op in classify.
	if cfg.allowPure0BSD && !cfg.pure0BSDConfirmNoDerivation {
		fmt.Fprintln(os.Stderr,
			"warning: --allow-pure-0bsd is set but --pure-0bsd-confirm-no-derivation is not; "+
				"pure 0BSD classification will not be applied. Textual similarity is not a legal "+
				"test for derivative-work status.")
	}
	if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation {
		fmt.Fprintln(os.Stderr,
			"warning: pure-0BSD classification is enabled. Files with low similarity and no "+
				"direct path match will be labeled 0BSD. Confirm those files are not derivative "+
				"works of upstream sources before publishing.")
	}
	if cfg.cleanNonTarget {
		fmt.Fprintln(os.Stderr,
			"warning: --clean-non-target-spdx is set; SPDX headers in files outside header-exts "+
				"will be removed. Audit the diff before committing to avoid stripping third-party "+
				"license markers in vendored or generated files.")
	}
	if cfg.replaceExisting {
		fmt.Fprintln(os.Stderr,
			"warning: --replace-existing is set; existing SPDX headers will be overwritten by "+
				"the classifier output. Review the report-json before committing.")
	}
}
|
|
|
|
// exitErr reports err on stderr in the tool's "Error: ..." format and
// terminates the process with exit status 1. Note: os.Exit skips any
// pending deferred calls in the caller.
func exitErr(err error) {
	fmt.Fprintf(os.Stderr, "Error: %v\n", err)
	os.Exit(1)
}
|
|
|
|
// init pins GOMAXPROCS to the machine's CPU count.
// NOTE(review): this has been the runtime default since Go 1.5, so the call
// is effectively a no-op on modern toolchains — confirm it can be removed.
func init() {
	_ = runtime.GOMAXPROCS(runtime.NumCPU())
}
|