// SPDX-License-Identifier: 0BSD // // license_scope_mapper analyzes file similarity against an upstream repository // and emits per-file SPDX license recommendations with optional header updates. // // Quick start: // // go run scripts/license_scope_mapper.go \ // --repo-root "/run/media/user1/projects/reticulum-meshchatX" \ // --overrides "scripts/license_scope_overrides.json" \ // --write-headers --replace-existing // // Environment variables are supported with the LICENSE_SCOPE_ prefix // (for example LICENSE_SCOPE_SCAN_EXTS and LICENSE_SCOPE_HEADER_EXTS). // // Important caveats (read before relying on output): // // 1. Textual similarity is not a legal test for derivative-work status. A file // forked from upstream and heavily refactored or renamed is still a // derivative work and still carries upstream license obligations. The // pure-0BSD branch is therefore opt-in (--allow-pure-0bsd) and gated // behind --pure-0bsd-confirm-no-derivation to avoid accidental // relicensing of upstream-derived material. // // 2. --clean-non-target-spdx and --replace-existing rewrite or delete // SPDX markers in tracked files. Run with --report-json first and // review the diff before enabling either flag; third-party SPDX lines // in vendored or generated files must not be silently stripped. // // 3. The default base license is "0BSD AND MIT", which means BOTH licenses // apply simultaneously to their respective contributions. This is the // safe default for files where provenance cannot be cleanly separated. package main import ( "bufio" "bytes" "context" "crypto/sha256" "encoding/hex" "encoding/json" "errors" "flag" "fmt" "hash/fnv" "io/fs" "os" "os/exec" "path/filepath" "regexp" "runtime" "sort" "strconv" "strings" "time" ) const ( envPrefix = "LICENSE_SCOPE_" defaultUpstreamURL = "https://github.com/liamcottle/reticulum-meshchat" ) var spdxPattern = regexp.MustCompile(`SPDX-License-Identifier:\s*([A-Za-z0-9.\-+ ()]+)`) type config struct { repoRoot string upstreamPath string upstreamURL string upstreamBranch string cloneTimeout time.Duration scanExtsRaw string headerExtsRaw string excludeDirsRaw string excludePathsRaw string maxFileSizeBytes int64 mitThreshold float64 mixedThreshold float64 allowPure0BSD bool pure0BSDConfirmNoDerivation bool pure0BSDThreshold float64 baseLicense string maxNameCandidates int writeHeaders bool replaceExisting bool cleanNonTarget bool overridesPath string reportJSON string explainPathsRaw string explainAll bool scanExts map[string]struct{} headerExts map[string]struct{} excludedDirs map[string]struct{} excludedContains []string explainPaths map[string]struct{} } type fileData struct { absPath string relPath string baseName string normHash string lineSet map[string]struct{} bigramSet map[uint64]struct{} shingle5Set map[uint64]struct{} shingle7Set map[uint64]struct{} } type matchResult struct { upstreamRelPath string directPathMatch bool similarityPct float64 lineJaccardPct float64 bigramDicePct float64 shingle5Pct float64 shingle7Pct float64 lineCommon int lineUnion int bigramCommon int bigramTotal int shingle5Common int shingle5Union int shingle7Common int shingle7Union int } type reportItem struct { Path string `json:"path"` License string `json:"license"` SimilarityPct float64 `json:"similarity_pct"` LineJaccardPct float64 `json:"line_jaccard_pct"` BigramDicePct float64 `json:"bigram_dice_pct"` Shingle5Pct float64 `json:"shingle5_jaccard_pct"` Shingle7Pct float64 `json:"shingle7_jaccard_pct"` OriginalPct float64 `json:"original_pct"` MinePct float64 `json:"mine_pct"` UpstreamMatch string `json:"upstream_match"` DirectPathMatch bool `json:"direct_path_match"` HeaderChanged bool `json:"header_changed"` ExistingSPDX string `json:"existing_spdx,omitempty"` } type reportConfig struct { RepoRoot string `json:"repo_root"` UpstreamURL string `json:"upstream_url"` UpstreamBranch string `json:"upstream_branch"` ScanExtensions string `json:"scan_extensions"` HeaderExtensions string `json:"header_extensions"` MitThreshold float64 `json:"mit_threshold"` MixedThreshold float64 `json:"mixed_threshold"` AllowPure0BSD bool `json:"allow_pure_0bsd"` Pure0BSDConfirmNoDerivation bool `json:"pure_0bsd_confirm_no_derivation"` Pure0BSDThreshold float64 `json:"pure_0bsd_threshold_pct"` BaseLicense string `json:"base_license"` MaxNameCandidates int `json:"max_name_candidates"` WriteHeaders bool `json:"write_headers"` ReplaceExisting bool `json:"replace_existing"` CleanNonTarget bool `json:"clean_non_target_spdx"` OverridesPath string `json:"overrides_path,omitempty"` } type reportSummary struct { FilesAnalyzed int `json:"files_analyzed"` HeadersUpdated int `json:"headers_updated"` RemovedNonTargetSPDX int `json:"removed_non_target_spdx"` LicenseCounts map[string]int `json:"license_counts"` } type reportDocument struct { GeneratedAt string `json:"generated_at"` Config reportConfig `json:"config"` Summary reportSummary `json:"summary"` Results []reportItem `json:"results"` } var commentStyleByExt = map[string]string{ ".py": "hash", ".sh": "hash", ".bash": "hash", ".zsh": "hash", ".yml": "hash", ".yaml": "hash", ".toml": "hash", ".ini": "hash", ".cfg": "hash", ".conf": "hash", ".mk": "hash", ".js": "line", ".ts": "line", ".jsx": "line", ".tsx": "line", ".c": "line", ".h": "line", ".cpp": "line", ".hpp": "line", ".java": "line", ".go": "line", ".rs": "line", ".swift": "line", ".kt": "line", ".css": "block", ".scss": "block", ".less": "block", ".html": "html", ".xml": "html", ".vue": "html", ".md": "html", } func main() { cfg, err := parseConfig() if err != nil { exitErr(err) } emitSafetyNotices(cfg) if cfg.cleanNonTarget { removed, err := removeSPDXFromNonTargetFiles(cfg) if err != nil { exitErr(err) } fmt.Printf("Removed SPDX headers from non-target files: %d\n", removed) } upstreamRoot := cfg.upstreamPath cleanup := func() {} if strings.TrimSpace(upstreamRoot) == "" { tmpDir, err := os.MkdirTemp("", "license-scope-upstream-*") if err != nil { exitErr(err) } cleanup = func() { _ = os.RemoveAll(tmpDir) } upstreamRoot = filepath.Join(tmpDir, "upstream") if err := cloneUpstream(cfg, upstreamRoot); err != nil { cleanup() exitErr(err) } } defer cleanup() repoRootAbs, _ := filepath.Abs(cfg.repoRoot) upstreamAbs, _ := filepath.Abs(upstreamRoot) localFiles, err := discoverFiles(cfg, repoRootAbs) if err != nil { exitErr(err) } overrides, err := loadOverrides(cfg.overridesPath) if err != nil { exitErr(err) } upstreamFiles, err := discoverFiles(cfg, upstreamAbs) if err != nil { exitErr(err) } upstreamByRel := make(map[string]fileData, len(upstreamFiles)) upstreamByBase := make(map[string][]fileData, len(upstreamFiles)) for _, f := range upstreamFiles { upstreamByRel[f.relPath] = f upstreamByBase[f.baseName] = append(upstreamByBase[f.baseName], f) } results := make([]reportItem, 0, len(localFiles)) classCounts := map[string]int{} headersUpdated := 0 removedNonTarget := 0 if cfg.cleanNonTarget { // We report cleanup count from function call side-effect by rescanning isn't needed. // Keep zero here and rely on printed output. } for _, local := range localFiles { match := findBestMatch(local, upstreamByRel, upstreamByBase, cfg.maxNameCandidates) license, decisionReason := classify(match, cfg) if forced, ok := overrides[local.relPath]; ok { license = forced decisionReason = "override file matched path" } classCounts[license]++ item := reportItem{ Path: local.relPath, License: license, SimilarityPct: round2(match.similarityPct), LineJaccardPct: round2(match.lineJaccardPct), BigramDicePct: round2(match.bigramDicePct), Shingle5Pct: round2(match.shingle5Pct), Shingle7Pct: round2(match.shingle7Pct), OriginalPct: round2(match.similarityPct), MinePct: round2(100.0 - match.similarityPct), UpstreamMatch: match.upstreamRelPath, DirectPathMatch: match.directPathMatch, } if cfg.writeHeaders && isHeaderExt(local.relPath, cfg.headerExts) { changed, existing, err := upsertSPDX(local.absPath, license, cfg.replaceExisting) if err != nil { fmt.Fprintf(os.Stderr, "SPDX update failed for %s: %v\n", local.relPath, err) } else { item.HeaderChanged = changed item.ExistingSPDX = existing if changed { headersUpdated++ } } } if shouldExplainPath(local.relPath, cfg.explainAll, cfg.explainPaths) { printExplanation(local.relPath, match, license, decisionReason, cfg) } results = append(results, item) } sort.Slice(results, func(i, j int) bool { return results[i].Path < results[j].Path }) doc := reportDocument{ GeneratedAt: time.Now().UTC().Format(time.RFC3339), Config: reportConfig{ RepoRoot: repoRootAbs, UpstreamURL: cfg.upstreamURL, UpstreamBranch: cfg.upstreamBranch, ScanExtensions: cfg.scanExtsRaw, HeaderExtensions: cfg.headerExtsRaw, MitThreshold: cfg.mitThreshold, MixedThreshold: cfg.mixedThreshold, AllowPure0BSD: cfg.allowPure0BSD, Pure0BSDConfirmNoDerivation: cfg.pure0BSDConfirmNoDerivation, Pure0BSDThreshold: cfg.pure0BSDThreshold, BaseLicense: cfg.baseLicense, MaxNameCandidates: cfg.maxNameCandidates, WriteHeaders: cfg.writeHeaders, ReplaceExisting: cfg.replaceExisting, CleanNonTarget: cfg.cleanNonTarget, OverridesPath: cfg.overridesPath, }, Summary: reportSummary{ FilesAnalyzed: len(results), HeadersUpdated: headersUpdated, RemovedNonTargetSPDX: removedNonTarget, LicenseCounts: classCounts, }, Results: results, } reportPath := filepath.Join(repoRootAbs, cfg.reportJSON) reportBytes, err := json.MarshalIndent(doc, "", " ") if err != nil { exitErr(err) } if err := os.WriteFile(reportPath, append(reportBytes, '\n'), 0o644); err != nil { exitErr(err) } fmt.Printf("Analyzed files: %d\n", len(results)) fmt.Printf("License counts -> %s\n", formatCountSummary(classCounts, float64(len(results)))) fmt.Printf("JSON report: %s\n", reportPath) if cfg.writeHeaders { fmt.Printf("Headers updated (%s): %d\n", cfg.headerExtsRaw, headersUpdated) } } func parseConfig() (config, error) { cfg := config{} flag.StringVar(&cfg.repoRoot, "repo-root", envString("REPO_ROOT", "."), "Path to repository root") flag.StringVar(&cfg.upstreamPath, "upstream-path", envString("UPSTREAM_PATH", ""), "Path to local upstream checkout") flag.StringVar(&cfg.upstreamURL, "upstream-url", envString("UPSTREAM_URL", defaultUpstreamURL), "Upstream git URL") flag.StringVar(&cfg.upstreamBranch, "upstream-branch", envString("UPSTREAM_BRANCH", "master"), "Upstream branch") flag.DurationVar(&cfg.cloneTimeout, "clone-timeout", envDuration("CLONE_TIMEOUT", 2*time.Minute), "Timeout for upstream clone") flag.StringVar(&cfg.scanExtsRaw, "scan-exts", envString("SCAN_EXTS", ".py,.vue"), "Comma-separated extensions for similarity analysis") flag.StringVar(&cfg.headerExtsRaw, "header-exts", envString("HEADER_EXTS", ".py,.vue"), "Comma-separated extensions eligible for SPDX headers") flag.StringVar(&cfg.excludeDirsRaw, "exclude-dirs", envString("EXCLUDE_DIRS", ".git,.idea,.vscode,.local,.pnpm-store,.flatpak-builder,node_modules,dist,build,.venv,venv,__pycache__,.pytest_cache,.mypy_cache,.ruff_cache"), "Comma-separated directories to skip") flag.StringVar(&cfg.excludePathsRaw, "exclude-path-contains", envString("EXCLUDE_PATH_CONTAINS", ""), "Comma-separated path substrings to skip") flag.Int64Var(&cfg.maxFileSizeBytes, "max-file-size-bytes", envInt64("MAX_FILE_SIZE_BYTES", 2_000_000), "Max file size for analysis") flag.Float64Var(&cfg.mitThreshold, "mit-threshold", envFloat("MIT_THRESHOLD", 0.85), "Composite similarity threshold for MIT") flag.Float64Var(&cfg.mixedThreshold, "mixed-threshold", envFloat("MIXED_THRESHOLD", 0.25), "Composite similarity threshold for mixed license") flag.BoolVar(&cfg.allowPure0BSD, "allow-pure-0bsd", envBool("ALLOW_PURE_0BSD", false), "Allow pure 0BSD classification for files with very low similarity and no direct path match. Disabled by default because textual similarity is not a legal test for derivative-work status.") flag.BoolVar(&cfg.pure0BSDConfirmNoDerivation, "pure-0bsd-confirm-no-derivation", envBool("PURE_0BSD_CONFIRM_NO_DERIVATION", false), "Required acknowledgement that files classified as pure 0BSD are not derivative works of upstream sources. Without this flag --allow-pure-0bsd is ignored.") flag.Float64Var(&cfg.pure0BSDThreshold, "pure-0bsd-threshold", envFloat("PURE_0BSD_THRESHOLD", 1.0), "Max similarity percent for pure 0BSD") flag.StringVar(&cfg.baseLicense, "base-license", envString("BASE_LICENSE", "0BSD AND MIT"), "Default SPDX license below mixed threshold") flag.IntVar(&cfg.maxNameCandidates, "max-name-candidates", envInt("MAX_NAME_CANDIDATES", 200), "Max basename candidates when direct path match missing") flag.BoolVar(&cfg.writeHeaders, "write-headers", envBool("WRITE_HEADERS", false), "Write SPDX headers") flag.BoolVar(&cfg.replaceExisting, "replace-existing", envBool("REPLACE_EXISTING", false), "Replace existing SPDX header") flag.BoolVar(&cfg.cleanNonTarget, "clean-non-target-spdx", envBool("CLEAN_NON_TARGET_SPDX", false), "Remove SPDX headers from files outside header-exts") flag.StringVar(&cfg.overridesPath, "overrides", envString("OVERRIDES", ""), "Path to JSON map of relpath->SPDX") flag.StringVar(&cfg.reportJSON, "report-json", envString("REPORT_JSON", "license-scope-report.json"), "JSON report output path") flag.StringVar(&cfg.explainPathsRaw, "explain-paths", envString("EXPLAIN_PATHS", ""), "Comma-separated repo-relative paths to explain scoring decisions") flag.BoolVar(&cfg.explainAll, "explain-all", envBool("EXPLAIN_ALL", false), "Print scoring explanation for every analyzed file") flag.Parse() cfg.scanExts = parseExtSet(cfg.scanExtsRaw) cfg.headerExts = parseExtSet(cfg.headerExtsRaw) cfg.excludedDirs = parseStringSet(cfg.excludeDirsRaw) cfg.excludedContains = parseStringList(cfg.excludePathsRaw) cfg.explainPaths = parsePathSet(cfg.explainPathsRaw) if len(cfg.scanExts) == 0 { return cfg, errors.New("scan-exts must include at least one extension") } if len(cfg.headerExts) == 0 { return cfg, errors.New("header-exts must include at least one extension") } if cfg.mixedThreshold < 0 || cfg.mitThreshold < 0 || cfg.mitThreshold > 1 || cfg.mixedThreshold > cfg.mitThreshold { return cfg, errors.New("thresholds must satisfy 0 <= mixed-threshold <= mit-threshold <= 1") } if cfg.pure0BSDThreshold < 0 || cfg.pure0BSDThreshold > 100 { return cfg, errors.New("pure-0bsd-threshold must be in [0,100] percent") } return cfg, nil } func envKey(name string) string { return envPrefix + name } func envString(name, fallback string) string { if v, ok := os.LookupEnv(envKey(name)); ok { return v } return fallback } func envBool(name string, fallback bool) bool { if v, ok := os.LookupEnv(envKey(name)); ok { b, err := strconv.ParseBool(strings.TrimSpace(v)) if err == nil { return b } } return fallback } func envInt(name string, fallback int) int { if v, ok := os.LookupEnv(envKey(name)); ok { i, err := strconv.Atoi(strings.TrimSpace(v)) if err == nil { return i } } return fallback } func envInt64(name string, fallback int64) int64 { if v, ok := os.LookupEnv(envKey(name)); ok { i, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64) if err == nil { return i } } return fallback } func envFloat(name string, fallback float64) float64 { if v, ok := os.LookupEnv(envKey(name)); ok { f, err := strconv.ParseFloat(strings.TrimSpace(v), 64) if err == nil { return f } } return fallback } func envDuration(name string, fallback time.Duration) time.Duration { if v, ok := os.LookupEnv(envKey(name)); ok { d, err := time.ParseDuration(strings.TrimSpace(v)) if err == nil { return d } } return fallback } func parseStringList(raw string) []string { out := []string{} for _, token := range strings.Split(raw, ",") { t := strings.TrimSpace(token) if t != "" { out = append(out, t) } } return out } func parseStringSet(raw string) map[string]struct{} { out := map[string]struct{}{} for _, t := range parseStringList(raw) { out[t] = struct{}{} } return out } func parsePathSet(raw string) map[string]struct{} { out := map[string]struct{}{} for _, p := range parseStringList(raw) { out[filepath.ToSlash(strings.TrimSpace(p))] = struct{}{} } return out } func parseExtSet(raw string) map[string]struct{} { out := map[string]struct{}{} for _, token := range parseStringList(raw) { ext := strings.ToLower(token) if !strings.HasPrefix(ext, ".") { ext = "." + ext } out[ext] = struct{}{} } return out } func cloneUpstream(cfg config, target string) error { ctx, cancel := context.WithTimeout(context.Background(), cfg.cloneTimeout) defer cancel() cmd := exec.CommandContext(ctx, "git", "clone", "--depth", "1", "--branch", cfg.upstreamBranch, cfg.upstreamURL, target) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { if ctx.Err() == context.DeadlineExceeded { return fmt.Errorf("git clone timed out after %s", cfg.cloneTimeout) } return err } return nil } func discoverFiles(cfg config, root string) ([]fileData, error) { out := []fileData{} err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return walkErr } name := d.Name() if d.IsDir() { if _, skip := cfg.excludedDirs[name]; skip { return filepath.SkipDir } return nil } rel, err := filepath.Rel(root, path) if err != nil { return err } rel = filepath.ToSlash(rel) if shouldSkipFile(cfg, rel, path) { return nil } normHash, lineSet, bigrams, s5, s7, err := loadAndNormalize(path) if err != nil { return nil } out = append(out, fileData{ absPath: path, relPath: rel, baseName: filepath.Base(path), normHash: normHash, lineSet: lineSet, bigramSet: bigrams, shingle5Set: s5, shingle7Set: s7, }) return nil }) return out, err } func shouldSkipFile(cfg config, rel, abs string) bool { for _, part := range cfg.excludedContains { if strings.Contains(rel, part) { return true } } ext := strings.ToLower(filepath.Ext(abs)) if _, ok := cfg.scanExts[ext]; !ok { return true } if _, ok := commentStyleByExt[ext]; !ok { return true } info, err := os.Stat(abs) if err != nil { return true } if info.Size() > cfg.maxFileSizeBytes { return true } return false } func loadAndNormalize(path string) (string, map[string]struct{}, map[uint64]struct{}, map[uint64]struct{}, map[uint64]struct{}, error) { raw, err := os.ReadFile(path) if err != nil { return "", nil, nil, nil, nil, err } if bytes.IndexByte(raw, 0) >= 0 { return "", nil, nil, nil, nil, errors.New("binary file") } sc := bufio.NewScanner(bytes.NewReader(raw)) sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) lines := []string{} lineSet := map[string]struct{}{} for sc.Scan() { line := sc.Text() if strings.Contains(line, "SPDX-License-Identifier:") { continue } norm := strings.ToLower(strings.TrimSpace(line)) if norm == "" { continue } lines = append(lines, norm) lineSet[norm] = struct{}{} } if err := sc.Err(); err != nil { return "", nil, nil, nil, nil, err } sum := sha256.Sum256([]byte(strings.Join(lines, "\n"))) return hex.EncodeToString(sum[:]), lineSet, makeNGramSet(lines, 2), makeNGramSet(lines, 5), makeNGramSet(lines, 7), nil } func findBestMatch(local fileData, byRel map[string]fileData, byBase map[string][]fileData, maxCandidates int) matchResult { best := matchResult{} if direct, ok := byRel[local.relPath]; ok { best = similarityDetail(local, direct) best.upstreamRelPath = direct.relPath best.directPathMatch = true } candidates := byBase[local.baseName] if len(candidates) > maxCandidates { candidates = candidates[:maxCandidates] } for _, cand := range candidates { if cand.relPath == best.upstreamRelPath { continue } d := similarityDetail(local, cand) if d.similarityPct > best.similarityPct { d.upstreamRelPath = cand.relPath d.directPathMatch = cand.relPath == local.relPath best = d } } if best.upstreamRelPath == local.relPath && best.upstreamRelPath != "" { best.directPathMatch = true } return best } func similarityDetail(a, b fileData) matchResult { if a.normHash != "" && a.normHash == b.normHash { return matchResult{ similarityPct: 100, lineJaccardPct: 100, bigramDicePct: 100, shingle5Pct: 100, shingle7Pct: 100, lineCommon: len(a.lineSet), lineUnion: len(a.lineSet), bigramCommon: len(a.bigramSet), bigramTotal: len(a.bigramSet) + len(b.bigramSet), shingle5Common: len(a.shingle5Set), shingle5Union: len(a.shingle5Set), shingle7Common: len(a.shingle7Set), shingle7Union: len(a.shingle7Set), } } lineJ, lineCommon, lineUnion := setJaccardPctWithCounts(a.lineSet, b.lineSet) biD, biCommon, biTotal := setDicePctWithCounts(a.bigramSet, b.bigramSet) sh5, sh5Common, sh5Union := setJaccardPctWithCounts(a.shingle5Set, b.shingle5Set) sh7, sh7Common, sh7Union := setJaccardPctWithCounts(a.shingle7Set, b.shingle7Set) composite := combineComposite(lineJ, biD, sh5, sh7, len(a.bigramSet) > 0 && len(b.bigramSet) > 0, len(a.shingle5Set) > 0 && len(b.shingle5Set) > 0, len(a.shingle7Set) > 0 && len(b.shingle7Set) > 0) return matchResult{ similarityPct: composite, lineJaccardPct: lineJ, bigramDicePct: biD, shingle5Pct: sh5, shingle7Pct: sh7, lineCommon: lineCommon, lineUnion: lineUnion, bigramCommon: biCommon, bigramTotal: biTotal, shingle5Common: sh5Common, shingle5Union: sh5Union, shingle7Common: sh7Common, shingle7Union: sh7Union, } } // classify maps a similarity result to an SPDX expression. The pure-0BSD // branch requires both --allow-pure-0bsd and --pure-0bsd-confirm-no-derivation // because textual similarity alone cannot prove non-derivation; renamed or // heavily refactored forks of upstream files retain MIT obligations. func classify(match matchResult, cfg config) (string, string) { sim := match.similarityPct / 100.0 if sim >= cfg.mitThreshold { return "MIT", fmt.Sprintf("composite similarity %.2f%% >= MIT threshold %.2f%%", match.similarityPct, cfg.mitThreshold*100.0) } if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation && !match.directPathMatch && match.similarityPct <= cfg.pure0BSDThreshold { return "0BSD", fmt.Sprintf("low similarity %.2f%% <= pure-0bsd threshold %.2f%% and no direct path match (operator confirmed non-derivation)", match.similarityPct, cfg.pure0BSDThreshold) } if sim >= cfg.mixedThreshold { return "0BSD AND MIT", fmt.Sprintf("composite similarity %.2f%% >= mixed threshold %.2f%%", match.similarityPct, cfg.mixedThreshold*100.0) } return cfg.baseLicense, fmt.Sprintf("below mixed threshold; fallback to base license %s", cfg.baseLicense) } func makeNGramSet(lines []string, n int) map[uint64]struct{} { out := map[uint64]struct{}{} if n <= 0 || len(lines) < n { return out } for i := 0; i <= len(lines)-n; i++ { out[hashNGram(lines[i:i+n])] = struct{}{} } return out } func hashNGram(parts []string) uint64 { h := fnv.New64a() for i, p := range parts { _, _ = h.Write([]byte(p)) if i+1 < len(parts) { _, _ = h.Write([]byte{0}) } } return h.Sum64() } func combineComposite(lineJ, biD, sh5, sh7 float64, hasBi, hasS5, hasS7 bool) float64 { type part struct { score float64 weight float64 ok bool } parts := []part{ {lineJ, 0.20, true}, {biD, 0.30, hasBi}, {sh5, 0.30, hasS5}, {sh7, 0.20, hasS7}, } weighted := 0.0 total := 0.0 for _, p := range parts { if !p.ok { continue } weighted += p.score * p.weight total += p.weight } if total <= 0 { return 0 } return weighted / total } func setJaccardPctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) { if len(a) == 0 || len(b) == 0 { return 0, 0, len(a) + len(b) } inter := 0 for k := range a { if _, ok := b[k]; ok { inter++ } } union := len(a) + len(b) - inter if union <= 0 { return 0, inter, union } return float64(inter) / float64(union) * 100.0, inter, union } func setDicePctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) { if len(a) == 0 || len(b) == 0 { return 0, 0, len(a) + len(b) } inter := 0 for k := range a { if _, ok := b[k]; ok { inter++ } } den := len(a) + len(b) if den <= 0 { return 0, inter, den } return 2.0 * float64(inter) / float64(den) * 100.0, inter, den } func loadOverrides(path string) (map[string]string, error) { if strings.TrimSpace(path) == "" { return map[string]string{}, nil } raw, err := os.ReadFile(path) if err != nil { return nil, err } loaded := map[string]string{} if err := json.Unmarshal(raw, &loaded); err != nil { return nil, err } out := map[string]string{} for k, v := range loaded { rel := filepath.ToSlash(strings.TrimSpace(k)) lic := strings.TrimSpace(v) if rel != "" && lic != "" { out[rel] = lic } } return out, nil } func isHeaderExt(path string, exts map[string]struct{}) bool { ext := strings.ToLower(filepath.Ext(path)) _, ok := exts[ext] return ok } func removeSPDXFromNonTargetFiles(cfg config) (int, error) { rootAbs, _ := filepath.Abs(cfg.repoRoot) removed := 0 err := filepath.WalkDir(rootAbs, func(path string, d fs.DirEntry, walkErr error) error { if walkErr != nil { return walkErr } if d.IsDir() { if _, skip := cfg.excludedDirs[d.Name()]; skip { return filepath.SkipDir } return nil } rel, err := filepath.Rel(rootAbs, path) if err != nil { return err } rel = filepath.ToSlash(rel) if shouldSkipFile(cfg, rel, path) { return nil } if isHeaderExt(path, cfg.headerExts) { return nil } changed, err := removeSPDXLine(path) if err != nil { return err } if changed { removed++ } return nil }) return removed, err } func removeSPDXLine(path string) (bool, error) { raw, err := os.ReadFile(path) if err != nil { return false, err } text := string(raw) lines := strings.Split(text, "\n") maxScan := 25 if len(lines) < maxScan { maxScan = len(lines) } idx := -1 for i := 0; i < maxScan; i++ { if strings.Contains(lines[i], "SPDX-License-Identifier:") { idx = i break } } if idx < 0 { return false, nil } out := append([]string{}, lines[:idx]...) if idx+1 < len(lines) && strings.TrimSpace(lines[idx+1]) == "" { out = append(out, lines[idx+2:]...) } else { out = append(out, lines[idx+1:]...) } newText := strings.Join(out, "\n") if newText == text { return false, nil } return true, os.WriteFile(path, []byte(newText), 0o644) } func upsertSPDX(path, spdx string, replaceExisting bool) (bool, string, error) { ext := strings.ToLower(filepath.Ext(path)) style, ok := commentStyleByExt[ext] if !ok { return false, "", nil } raw, err := os.ReadFile(path) if err != nil { return false, "", err } text := string(raw) lines := strings.Split(text, "\n") existingIdx := -1 existingValue := "" maxScan := 25 if len(lines) < maxScan { maxScan = len(lines) } for i := 0; i < maxScan; i++ { if strings.Contains(lines[i], "SPDX-License-Identifier:") { existingIdx = i m := spdxPattern.FindStringSubmatch(lines[i]) if len(m) > 1 { existingValue = strings.TrimSpace(m[1]) } break } } spdxLine := formatSPDX(style, spdx) if existingIdx >= 0 { if !replaceExisting { return false, existingValue, nil } lines[existingIdx] = spdxLine } else { insertAt := 0 if len(lines) > 0 && strings.HasPrefix(lines[0], "#!") { insertAt = 1 } if ext == ".py" && len(lines) > insertAt && strings.HasPrefix(lines[insertAt], "#") && strings.Contains(lines[insertAt], "coding") { insertAt++ } prefix := append([]string{}, lines[:insertAt]...) suffix := append([]string{}, lines[insertAt:]...) lines = append(prefix, spdxLine, "") lines = append(lines, suffix...) } newText := strings.Join(lines, "\n") if newText == text { return false, existingValue, nil } return true, existingValue, os.WriteFile(path, []byte(newText), 0o644) } func formatSPDX(style, spdx string) string { switch style { case "hash": return "# SPDX-License-Identifier: " + spdx case "line": return "// SPDX-License-Identifier: " + spdx case "block": return "/* SPDX-License-Identifier: " + spdx + " */" case "html": return "" default: return "// SPDX-License-Identifier: " + spdx } } func formatCountSummary(counts map[string]int, total float64) string { keys := make([]string, 0, len(counts)) for k := range counts { keys = append(keys, k) } sort.Strings(keys) parts := make([]string, 0, len(keys)) for _, k := range keys { parts = append(parts, fmt.Sprintf("%s: %d (%.2f%%)", k, counts[k], pct(counts[k], total))) } return strings.Join(parts, ", ") } func shouldExplainPath(relPath string, explainAll bool, explainPaths map[string]struct{}) bool { if explainAll { return true } _, ok := explainPaths[filepath.ToSlash(relPath)] return ok } func printExplanation(relPath string, match matchResult, license, decisionReason string, cfg config) { fmt.Printf("\n--- score explanation: %s ---\n", relPath) if match.upstreamRelPath == "" { fmt.Printf("upstream match: \n") } else { fmt.Printf("upstream match: %s\n", match.upstreamRelPath) } fmt.Printf("direct path match: %v\n", match.directPathMatch) fmt.Printf("composite similarity: %.2f%%\n", match.similarityPct) fmt.Printf("line_jaccard: %.2f%% (%d common / %d union)\n", match.lineJaccardPct, match.lineCommon, match.lineUnion) fmt.Printf("bigram_dice: %.2f%% (%d common / %d total)\n", match.bigramDicePct, match.bigramCommon, match.bigramTotal) fmt.Printf("shingle5_jaccard: %.2f%% (%d common / %d union)\n", match.shingle5Pct, match.shingle5Common, match.shingle5Union) fmt.Printf("shingle7_jaccard: %.2f%% (%d common / %d union)\n", match.shingle7Pct, match.shingle7Common, match.shingle7Union) fmt.Printf( "thresholds: mit=%.2f%% mixed=%.2f%% pure_0bsd=%.2f%% allow_pure_0bsd=%v\n", cfg.mitThreshold*100.0, cfg.mixedThreshold*100.0, cfg.pure0BSDThreshold, cfg.allowPure0BSD, ) fmt.Printf("decision: %s (%s)\n", license, decisionReason) fmt.Printf("--- end explanation ---\n") } func round2(v float64) float64 { return float64(int(v*100+0.5)) / 100 } func pct(count int, total float64) float64 { if total <= 0 { return 0 } return float64(count) / total * 100.0 } // emitSafetyNotices prints stderr warnings for flag combinations that can // silently relicense or strip third-party license markers. The pure-0BSD // branch in classify also requires the explicit confirm flag, so this // function reminds the operator when --allow-pure-0bsd is set without it. func emitSafetyNotices(cfg config) { if cfg.allowPure0BSD && !cfg.pure0BSDConfirmNoDerivation { fmt.Fprintln(os.Stderr, "warning: --allow-pure-0bsd is set but --pure-0bsd-confirm-no-derivation is not; "+ "pure 0BSD classification will not be applied. Textual similarity is not a legal "+ "test for derivative-work status.") } if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation { fmt.Fprintln(os.Stderr, "warning: pure-0BSD classification is enabled. Files with low similarity and no "+ "direct path match will be labeled 0BSD. Confirm those files are not derivative "+ "works of upstream sources before publishing.") } if cfg.cleanNonTarget { fmt.Fprintln(os.Stderr, "warning: --clean-non-target-spdx is set; SPDX headers in files outside header-exts "+ "will be removed. Audit the diff before committing to avoid stripping third-party "+ "license markers in vendored or generated files.") } if cfg.replaceExisting { fmt.Fprintln(os.Stderr, "warning: --replace-existing is set; existing SPDX headers will be overwritten by "+ "the classifier output. Review the report-json before committing.") } } func exitErr(err error) { _, _ = fmt.Fprintf(os.Stderr, "Error: %v\n", err) os.Exit(1) } func init() { _ = runtime.GOMAXPROCS(runtime.NumCPU()) }