mirror of
https://git.quad4.io/RNS-Things/MeshChatX.git
synced 2026-04-24 12:55:53 +00:00
1054 lines
33 KiB
Go
1054 lines
33 KiB
Go
// SPDX-License-Identifier: 0BSD
|
|
//
|
|
// license_scope_mapper analyzes file similarity against an upstream repository
|
|
// and emits per-file SPDX license recommendations with optional header updates.
|
|
//
|
|
// Quick start:
|
|
//
|
|
// go run scripts/license_scope_mapper.go \
|
|
// --repo-root "/run/media/user1/projects/reticulum-meshchatX" \
|
|
// --overrides "scripts/license_scope_overrides.json" \
|
|
// --write-headers --replace-existing
|
|
//
|
|
// Environment variables are supported with the LICENSE_SCOPE_ prefix
|
|
// (for example LICENSE_SCOPE_SCAN_EXTS and LICENSE_SCOPE_HEADER_EXTS).
|
|
//
|
|
// Important caveats (read before relying on output):
|
|
//
|
|
// 1. Textual similarity is not a legal test for derivative-work status. A file
|
|
// forked from upstream and heavily refactored or renamed is still a
|
|
// derivative work and still carries upstream license obligations. The
|
|
// pure-0BSD branch is therefore opt-in (--allow-pure-0bsd) and gated
|
|
// behind --pure-0bsd-confirm-no-derivation to avoid accidental
|
|
// relicensing of upstream-derived material.
|
|
//
|
|
// 2. --clean-non-target-spdx and --replace-existing rewrite or delete
|
|
// SPDX markers in tracked files. Run with --report-json first and
|
|
// review the diff before enabling either flag; third-party SPDX lines
|
|
// in vendored or generated files must not be silently stripped.
|
|
//
|
|
// 3. The default base license is "0BSD AND MIT", which means BOTH licenses
|
|
// apply simultaneously to their respective contributions. This is the
|
|
// safe default for files where provenance cannot be cleanly separated.
|
|
package main
|
|
|
|
import (
	"bufio"
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"hash/fnv"
	"io/fs"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"time"
)
|
|
|
|
const (
	// envPrefix namespaces every environment-variable override (for
	// example LICENSE_SCOPE_SCAN_EXTS); see envKey.
	envPrefix = "LICENSE_SCOPE_"
	// defaultUpstreamURL is the repository compared against when
	// --upstream-url is not supplied.
	defaultUpstreamURL = "https://github.com/liamcottle/reticulum-meshchat"
)
|
|
|
|
// spdxPattern extracts the license expression from an existing
// "SPDX-License-Identifier:" line; capture group 1 is the expression.
var spdxPattern = regexp.MustCompile(`SPDX-License-Identifier:\s*([A-Za-z0-9.\-+ ()]+)`)
|
|
|
|
// config carries every runtime option. The *Raw string fields hold
// comma-separated flag values exactly as given; the map/slice fields at the
// bottom are their parsed forms, filled in by parseConfig.
type config struct {
	repoRoot string // repository being classified
	upstreamPath string // optional local upstream checkout; empty => clone
	upstreamURL string // git URL cloned when upstreamPath is empty
	upstreamBranch string // branch used for the shallow clone
	cloneTimeout time.Duration // hard limit on the clone subprocess
	scanExtsRaw string // extensions analyzed for similarity (raw CSV)
	headerExtsRaw string // extensions eligible for SPDX headers (raw CSV)
	excludeDirsRaw string // directory names pruned during walks (raw CSV)
	excludePathsRaw string // path substrings skipped during walks (raw CSV)
	maxFileSizeBytes int64 // files larger than this are ignored
	mitThreshold float64 // composite similarity fraction (0..1) for plain MIT
	mixedThreshold float64 // composite similarity fraction (0..1) for mixed license
	allowPure0BSD bool // opt-in gate for the pure-0BSD branch in classify
	pure0BSDConfirmNoDerivation bool // second explicit gate required for pure 0BSD
	pure0BSDThreshold float64 // max similarity PERCENT (0..100) for pure 0BSD
	baseLicense string // fallback SPDX expression below mixedThreshold
	maxNameCandidates int // cap on same-basename upstream candidates
	writeHeaders bool // rewrite SPDX headers on disk
	replaceExisting bool // overwrite an existing SPDX header when writing
	cleanNonTarget bool // strip SPDX lines from files outside headerExts
	overridesPath string // JSON map of relpath -> forced SPDX expression
	reportJSON string // report output path, joined onto repoRoot
	explainPathsRaw string // paths to print score explanations for (raw CSV)
	explainAll bool // explain every analyzed file

	// Parsed/derived forms of the raw values above (set by parseConfig).
	scanExts map[string]struct{}
	headerExts map[string]struct{}
	excludedDirs map[string]struct{}
	excludedContains []string
	explainPaths map[string]struct{}
}
|
|
|
|
// fileData is the precomputed similarity fingerprint of one source file:
// a hash of its normalized content plus the line/bigram/shingle sets that
// similarityDetail compares.
type fileData struct {
	absPath string // absolute path on disk
	relPath string // slash-separated path relative to the scan root
	baseName string // filepath.Base, used for cross-path candidate lookup
	normHash string // sha256 hex of the normalized, joined lines
	lineSet map[string]struct{} // unique normalized lines
	bigramSet map[uint64]struct{} // hashed 2-line shingles
	shingle5Set map[uint64]struct{} // hashed 5-line shingles
	shingle7Set map[uint64]struct{} // hashed 7-line shingles
}
|
|
|
|
// matchResult is the outcome of comparing one local file against its best
// upstream counterpart: the weighted composite score, each individual metric
// score, and the raw set counts behind each metric (surfaced by --explain).
type matchResult struct {
	upstreamRelPath string // best-matching upstream path ("" when none found)
	directPathMatch bool // matched at the identical relative path
	similarityPct float64 // weighted composite of the four metrics (0..100)
	lineJaccardPct float64 // Jaccard over unique normalized lines
	bigramDicePct float64 // Dice over 2-line shingles
	shingle5Pct float64 // Jaccard over 5-line shingles
	shingle7Pct float64 // Jaccard over 7-line shingles
	lineCommon int
	lineUnion int
	bigramCommon int
	bigramTotal int // Dice denominator: len(a)+len(b)
	shingle5Common int
	shingle5Union int
	shingle7Common int
	shingle7Union int
}
|
|
|
|
// reportItem is one per-file row of the JSON report. OriginalPct mirrors the
// composite similarity; MinePct is its complement (100 - similarity).
type reportItem struct {
	Path string `json:"path"`
	License string `json:"license"`
	SimilarityPct float64 `json:"similarity_pct"`
	LineJaccardPct float64 `json:"line_jaccard_pct"`
	BigramDicePct float64 `json:"bigram_dice_pct"`
	Shingle5Pct float64 `json:"shingle5_jaccard_pct"`
	Shingle7Pct float64 `json:"shingle7_jaccard_pct"`
	OriginalPct float64 `json:"original_pct"`
	MinePct float64 `json:"mine_pct"`
	UpstreamMatch string `json:"upstream_match"`
	DirectPathMatch bool `json:"direct_path_match"`
	HeaderChanged bool `json:"header_changed"`
	ExistingSPDX string `json:"existing_spdx,omitempty"`
}
|
|
|
|
// reportConfig is the effective configuration echoed into the JSON report so
// a report is reproducible without the original command line.
type reportConfig struct {
	RepoRoot string `json:"repo_root"`
	UpstreamURL string `json:"upstream_url"`
	UpstreamBranch string `json:"upstream_branch"`
	ScanExtensions string `json:"scan_extensions"`
	HeaderExtensions string `json:"header_extensions"`
	MitThreshold float64 `json:"mit_threshold"`
	MixedThreshold float64 `json:"mixed_threshold"`
	AllowPure0BSD bool `json:"allow_pure_0bsd"`
	Pure0BSDConfirmNoDerivation bool `json:"pure_0bsd_confirm_no_derivation"`
	Pure0BSDThreshold float64 `json:"pure_0bsd_threshold_pct"`
	BaseLicense string `json:"base_license"`
	MaxNameCandidates int `json:"max_name_candidates"`
	WriteHeaders bool `json:"write_headers"`
	ReplaceExisting bool `json:"replace_existing"`
	CleanNonTarget bool `json:"clean_non_target_spdx"`
	OverridesPath string `json:"overrides_path,omitempty"`
}
|
|
|
|
// reportSummary aggregates run-level counters for the JSON report.
type reportSummary struct {
	FilesAnalyzed int `json:"files_analyzed"`
	HeadersUpdated int `json:"headers_updated"`
	RemovedNonTargetSPDX int `json:"removed_non_target_spdx"`
	LicenseCounts map[string]int `json:"license_counts"`
}
|
|
|
|
// reportDocument is the top-level JSON report: timestamp, the effective
// configuration, summary counters, and the per-file results.
type reportDocument struct {
	GeneratedAt string `json:"generated_at"`
	Config reportConfig `json:"config"`
	Summary reportSummary `json:"summary"`
	Results []reportItem `json:"results"`
}
|
|
|
|
// commentStyleByExt maps a lowercased file extension to the comment style
// used for SPDX header lines (see formatSPDX): "hash" => "#", "line" =>
// "//", "block" => "/* */", "html" => "<!-- -->". Extensions missing from
// this map are excluded from analysis entirely (see shouldSkipFile).
var commentStyleByExt = map[string]string{
	".py": "hash",
	".sh": "hash",
	".bash": "hash",
	".zsh": "hash",
	".yml": "hash",
	".yaml": "hash",
	".toml": "hash",
	".ini": "hash",
	".cfg": "hash",
	".conf": "hash",
	".mk": "hash",
	".js": "line",
	".ts": "line",
	".jsx": "line",
	".tsx": "line",
	".c": "line",
	".h": "line",
	".cpp": "line",
	".hpp": "line",
	".java": "line",
	".go": "line",
	".rs": "line",
	".swift": "line",
	".kt": "line",
	".css": "block",
	".scss": "block",
	".less": "block",
	".html": "html",
	".xml": "html",
	".vue": "html",
	".md": "html",
}
|
|
|
|
// main runs the full pipeline: parse flags, warn about dangerous flag
// combinations, optionally strip SPDX markers from non-target files, obtain
// an upstream tree (local checkout or fresh shallow clone), score every
// local file against upstream, classify a license per file (with optional
// overrides), optionally rewrite headers, and write the JSON report.
func main() {
	cfg, err := parseConfig()
	if err != nil {
		exitErr(err)
	}

	// Stderr warnings for flags that can rewrite or strip license markers.
	emitSafetyNotices(cfg)

	if cfg.cleanNonTarget {
		removed, err := removeSPDXFromNonTargetFiles(cfg)
		if err != nil {
			exitErr(err)
		}
		fmt.Printf("Removed SPDX headers from non-target files: %d\n", removed)
	}

	// Prefer a local upstream checkout; otherwise shallow-clone into a temp
	// dir that is removed via cleanup.
	upstreamRoot := cfg.upstreamPath
	cleanup := func() {}
	if strings.TrimSpace(upstreamRoot) == "" {
		tmpDir, err := os.MkdirTemp("", "license-scope-upstream-*")
		if err != nil {
			exitErr(err)
		}
		cleanup = func() { _ = os.RemoveAll(tmpDir) }
		upstreamRoot = filepath.Join(tmpDir, "upstream")
		if err := cloneUpstream(cfg, upstreamRoot); err != nil {
			cleanup()
			exitErr(err)
		}
	}
	// NOTE(review): exitErr calls os.Exit, which skips deferred calls, so the
	// temp clone leaks on later error paths — confirm whether that matters.
	defer cleanup()

	repoRootAbs, _ := filepath.Abs(cfg.repoRoot)
	upstreamAbs, _ := filepath.Abs(upstreamRoot)

	localFiles, err := discoverFiles(cfg, repoRootAbs)
	if err != nil {
		exitErr(err)
	}
	overrides, err := loadOverrides(cfg.overridesPath)
	if err != nil {
		exitErr(err)
	}
	upstreamFiles, err := discoverFiles(cfg, upstreamAbs)
	if err != nil {
		exitErr(err)
	}

	// Index upstream by relative path (direct matches) and by basename
	// (renamed/moved candidates).
	upstreamByRel := make(map[string]fileData, len(upstreamFiles))
	upstreamByBase := make(map[string][]fileData, len(upstreamFiles))
	for _, f := range upstreamFiles {
		upstreamByRel[f.relPath] = f
		upstreamByBase[f.baseName] = append(upstreamByBase[f.baseName], f)
	}

	results := make([]reportItem, 0, len(localFiles))
	classCounts := map[string]int{}
	headersUpdated := 0
	removedNonTarget := 0
	if cfg.cleanNonTarget {
		// Cleanup already ran (and printed its count) above. The JSON
		// summary intentionally keeps zero here instead of rescanning the
		// tree just to recount.
	}

	for _, local := range localFiles {
		match := findBestMatch(local, upstreamByRel, upstreamByBase, cfg.maxNameCandidates)
		license, decisionReason := classify(match, cfg)
		// Explicit operator overrides win over the classifier.
		if forced, ok := overrides[local.relPath]; ok {
			license = forced
			decisionReason = "override file matched path"
		}
		classCounts[license]++

		item := reportItem{
			Path: local.relPath,
			License: license,
			SimilarityPct: round2(match.similarityPct),
			LineJaccardPct: round2(match.lineJaccardPct),
			BigramDicePct: round2(match.bigramDicePct),
			Shingle5Pct: round2(match.shingle5Pct),
			Shingle7Pct: round2(match.shingle7Pct),
			OriginalPct: round2(match.similarityPct),
			MinePct: round2(100.0 - match.similarityPct),
			UpstreamMatch: match.upstreamRelPath,
			DirectPathMatch: match.directPathMatch,
		}

		if cfg.writeHeaders && isHeaderExt(local.relPath, cfg.headerExts) {
			changed, existing, err := upsertSPDX(local.absPath, license, cfg.replaceExisting)
			if err != nil {
				// Header failures are reported but do not abort the run.
				fmt.Fprintf(os.Stderr, "SPDX update failed for %s: %v\n", local.relPath, err)
			} else {
				item.HeaderChanged = changed
				item.ExistingSPDX = existing
				if changed {
					headersUpdated++
				}
			}
		}

		if shouldExplainPath(local.relPath, cfg.explainAll, cfg.explainPaths) {
			printExplanation(local.relPath, match, license, decisionReason, cfg)
		}

		results = append(results, item)
	}

	// Deterministic report ordering regardless of walk order.
	sort.Slice(results, func(i, j int) bool { return results[i].Path < results[j].Path })

	doc := reportDocument{
		GeneratedAt: time.Now().UTC().Format(time.RFC3339),
		Config: reportConfig{
			RepoRoot: repoRootAbs,
			UpstreamURL: cfg.upstreamURL,
			UpstreamBranch: cfg.upstreamBranch,
			ScanExtensions: cfg.scanExtsRaw,
			HeaderExtensions: cfg.headerExtsRaw,
			MitThreshold: cfg.mitThreshold,
			MixedThreshold: cfg.mixedThreshold,
			AllowPure0BSD: cfg.allowPure0BSD,
			Pure0BSDConfirmNoDerivation: cfg.pure0BSDConfirmNoDerivation,
			Pure0BSDThreshold: cfg.pure0BSDThreshold,
			BaseLicense: cfg.baseLicense,
			MaxNameCandidates: cfg.maxNameCandidates,
			WriteHeaders: cfg.writeHeaders,
			ReplaceExisting: cfg.replaceExisting,
			CleanNonTarget: cfg.cleanNonTarget,
			OverridesPath: cfg.overridesPath,
		},
		Summary: reportSummary{
			FilesAnalyzed: len(results),
			HeadersUpdated: headersUpdated,
			RemovedNonTargetSPDX: removedNonTarget,
			LicenseCounts: classCounts,
		},
		Results: results,
	}

	reportPath := filepath.Join(repoRootAbs, cfg.reportJSON)
	reportBytes, err := json.MarshalIndent(doc, "", " ")
	if err != nil {
		exitErr(err)
	}
	if err := os.WriteFile(reportPath, append(reportBytes, '\n'), 0o644); err != nil {
		exitErr(err)
	}

	fmt.Printf("Analyzed files: %d\n", len(results))
	fmt.Printf("License counts -> %s\n", formatCountSummary(classCounts, float64(len(results))))
	fmt.Printf("JSON report: %s\n", reportPath)
	if cfg.writeHeaders {
		fmt.Printf("Headers updated (%s): %d\n", cfg.headerExtsRaw, headersUpdated)
	}
}
|
|
|
|
// parseConfig registers all CLI flags (each defaulting to its LICENSE_SCOPE_
// environment variable when present), parses them, derives the parsed
// set/list fields from the raw comma-separated values, and validates the
// threshold ranges. Note the unit split: mit/mixed thresholds are fractions
// in [0,1] while pure-0bsd-threshold is a percent in [0,100].
func parseConfig() (config, error) {
	cfg := config{}
	flag.StringVar(&cfg.repoRoot, "repo-root", envString("REPO_ROOT", "."), "Path to repository root")
	flag.StringVar(&cfg.upstreamPath, "upstream-path", envString("UPSTREAM_PATH", ""), "Path to local upstream checkout")
	flag.StringVar(&cfg.upstreamURL, "upstream-url", envString("UPSTREAM_URL", defaultUpstreamURL), "Upstream git URL")
	flag.StringVar(&cfg.upstreamBranch, "upstream-branch", envString("UPSTREAM_BRANCH", "master"), "Upstream branch")
	flag.DurationVar(&cfg.cloneTimeout, "clone-timeout", envDuration("CLONE_TIMEOUT", 2*time.Minute), "Timeout for upstream clone")
	flag.StringVar(&cfg.scanExtsRaw, "scan-exts", envString("SCAN_EXTS", ".py,.vue"), "Comma-separated extensions for similarity analysis")
	flag.StringVar(&cfg.headerExtsRaw, "header-exts", envString("HEADER_EXTS", ".py,.vue"), "Comma-separated extensions eligible for SPDX headers")
	flag.StringVar(&cfg.excludeDirsRaw, "exclude-dirs", envString("EXCLUDE_DIRS", ".git,.idea,.vscode,.local,.pnpm-store,.flatpak-builder,node_modules,dist,build,.venv,venv,__pycache__,.pytest_cache,.mypy_cache,.ruff_cache"), "Comma-separated directories to skip")
	flag.StringVar(&cfg.excludePathsRaw, "exclude-path-contains", envString("EXCLUDE_PATH_CONTAINS", ""), "Comma-separated path substrings to skip")
	flag.Int64Var(&cfg.maxFileSizeBytes, "max-file-size-bytes", envInt64("MAX_FILE_SIZE_BYTES", 2_000_000), "Max file size for analysis")
	flag.Float64Var(&cfg.mitThreshold, "mit-threshold", envFloat("MIT_THRESHOLD", 0.85), "Composite similarity threshold for MIT")
	flag.Float64Var(&cfg.mixedThreshold, "mixed-threshold", envFloat("MIXED_THRESHOLD", 0.25), "Composite similarity threshold for mixed license")
	flag.BoolVar(&cfg.allowPure0BSD, "allow-pure-0bsd", envBool("ALLOW_PURE_0BSD", false), "Allow pure 0BSD classification for files with very low similarity and no direct path match. Disabled by default because textual similarity is not a legal test for derivative-work status.")
	flag.BoolVar(&cfg.pure0BSDConfirmNoDerivation, "pure-0bsd-confirm-no-derivation", envBool("PURE_0BSD_CONFIRM_NO_DERIVATION", false), "Required acknowledgement that files classified as pure 0BSD are not derivative works of upstream sources. Without this flag --allow-pure-0bsd is ignored.")
	flag.Float64Var(&cfg.pure0BSDThreshold, "pure-0bsd-threshold", envFloat("PURE_0BSD_THRESHOLD", 1.0), "Max similarity percent for pure 0BSD")
	flag.StringVar(&cfg.baseLicense, "base-license", envString("BASE_LICENSE", "0BSD AND MIT"), "Default SPDX license below mixed threshold")
	flag.IntVar(&cfg.maxNameCandidates, "max-name-candidates", envInt("MAX_NAME_CANDIDATES", 200), "Max basename candidates when direct path match missing")
	flag.BoolVar(&cfg.writeHeaders, "write-headers", envBool("WRITE_HEADERS", false), "Write SPDX headers")
	flag.BoolVar(&cfg.replaceExisting, "replace-existing", envBool("REPLACE_EXISTING", false), "Replace existing SPDX header")
	flag.BoolVar(&cfg.cleanNonTarget, "clean-non-target-spdx", envBool("CLEAN_NON_TARGET_SPDX", false), "Remove SPDX headers from files outside header-exts")
	flag.StringVar(&cfg.overridesPath, "overrides", envString("OVERRIDES", ""), "Path to JSON map of relpath->SPDX")
	flag.StringVar(&cfg.reportJSON, "report-json", envString("REPORT_JSON", "license-scope-report.json"), "JSON report output path")
	flag.StringVar(&cfg.explainPathsRaw, "explain-paths", envString("EXPLAIN_PATHS", ""), "Comma-separated repo-relative paths to explain scoring decisions")
	flag.BoolVar(&cfg.explainAll, "explain-all", envBool("EXPLAIN_ALL", false), "Print scoring explanation for every analyzed file")
	flag.Parse()

	// Derive the parsed forms of the comma-separated raw values.
	cfg.scanExts = parseExtSet(cfg.scanExtsRaw)
	cfg.headerExts = parseExtSet(cfg.headerExtsRaw)
	cfg.excludedDirs = parseStringSet(cfg.excludeDirsRaw)
	cfg.excludedContains = parseStringList(cfg.excludePathsRaw)
	cfg.explainPaths = parsePathSet(cfg.explainPathsRaw)

	if len(cfg.scanExts) == 0 {
		return cfg, errors.New("scan-exts must include at least one extension")
	}
	if len(cfg.headerExts) == 0 {
		return cfg, errors.New("header-exts must include at least one extension")
	}
	if cfg.mixedThreshold < 0 || cfg.mitThreshold < 0 || cfg.mitThreshold > 1 || cfg.mixedThreshold > cfg.mitThreshold {
		return cfg, errors.New("thresholds must satisfy 0 <= mixed-threshold <= mit-threshold <= 1")
	}
	if cfg.pure0BSDThreshold < 0 || cfg.pure0BSDThreshold > 100 {
		return cfg, errors.New("pure-0bsd-threshold must be in [0,100] percent")
	}
	return cfg, nil
}
|
|
|
|
// envKey qualifies name with the tool's environment-variable namespace.
func envKey(name string) string {
	return envPrefix + name
}
|
|
|
|
func envString(name, fallback string) string {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
return v
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envBool(name string, fallback bool) bool {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
b, err := strconv.ParseBool(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return b
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envInt(name string, fallback int) int {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
i, err := strconv.Atoi(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return i
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envInt64(name string, fallback int64) int64 {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
i, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
|
|
if err == nil {
|
|
return i
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envFloat(name string, fallback float64) float64 {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
|
|
if err == nil {
|
|
return f
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
func envDuration(name string, fallback time.Duration) time.Duration {
|
|
if v, ok := os.LookupEnv(envKey(name)); ok {
|
|
d, err := time.ParseDuration(strings.TrimSpace(v))
|
|
if err == nil {
|
|
return d
|
|
}
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
// parseStringList splits raw on commas, trims each token, and drops empty
// tokens. The result is always non-nil (possibly empty).
func parseStringList(raw string) []string {
	tokens := strings.Split(raw, ",")
	out := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if trimmed := strings.TrimSpace(tok); trimmed != "" {
			out = append(out, trimmed)
		}
	}
	return out
}
|
|
|
|
func parseStringSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, t := range parseStringList(raw) {
|
|
out[t] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func parsePathSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, p := range parseStringList(raw) {
|
|
out[filepath.ToSlash(strings.TrimSpace(p))] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func parseExtSet(raw string) map[string]struct{} {
|
|
out := map[string]struct{}{}
|
|
for _, token := range parseStringList(raw) {
|
|
ext := strings.ToLower(token)
|
|
if !strings.HasPrefix(ext, ".") {
|
|
ext = "." + ext
|
|
}
|
|
out[ext] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func cloneUpstream(cfg config, target string) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), cfg.cloneTimeout)
|
|
defer cancel()
|
|
cmd := exec.CommandContext(ctx, "git", "clone", "--depth", "1", "--branch", cfg.upstreamBranch, cfg.upstreamURL, target)
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
if err := cmd.Run(); err != nil {
|
|
if ctx.Err() == context.DeadlineExceeded {
|
|
return fmt.Errorf("git clone timed out after %s", cfg.cloneTimeout)
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// discoverFiles walks root and fingerprints every file that survives the
// exclusion rules (shouldSkipFile). Files that fail to load or normalize
// (unreadable, binary, scanner-overflow) are skipped silently rather than
// aborting the walk; directory-level walk errors are fatal.
func discoverFiles(cfg config, root string) ([]fileData, error) {
	out := []fileData{}
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		name := d.Name()
		if d.IsDir() {
			// Prune excluded directories wholesale.
			if _, skip := cfg.excludedDirs[name]; skip {
				return filepath.SkipDir
			}
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		// Relative paths are slash-normalized so local and upstream trees
		// index identically on every platform.
		rel = filepath.ToSlash(rel)
		if shouldSkipFile(cfg, rel, path) {
			return nil
		}
		normHash, lineSet, bigrams, s5, s7, err := loadAndNormalize(path)
		if err != nil {
			// Deliberate: unreadable/binary files are skipped, not fatal.
			return nil
		}
		out = append(out, fileData{
			absPath: path,
			relPath: rel,
			baseName: filepath.Base(path),
			normHash: normHash,
			lineSet: lineSet,
			bigramSet: bigrams,
			shingle5Set: s5,
			shingle7Set: s7,
		})
		return nil
	})
	return out, err
}
|
|
|
|
func shouldSkipFile(cfg config, rel, abs string) bool {
|
|
for _, part := range cfg.excludedContains {
|
|
if strings.Contains(rel, part) {
|
|
return true
|
|
}
|
|
}
|
|
ext := strings.ToLower(filepath.Ext(abs))
|
|
if _, ok := cfg.scanExts[ext]; !ok {
|
|
return true
|
|
}
|
|
if _, ok := commentStyleByExt[ext]; !ok {
|
|
return true
|
|
}
|
|
info, err := os.Stat(abs)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
if info.Size() > cfg.maxFileSizeBytes {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func loadAndNormalize(path string) (string, map[string]struct{}, map[uint64]struct{}, map[uint64]struct{}, map[uint64]struct{}, error) {
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return "", nil, nil, nil, nil, err
|
|
}
|
|
if bytes.IndexByte(raw, 0) >= 0 {
|
|
return "", nil, nil, nil, nil, errors.New("binary file")
|
|
}
|
|
sc := bufio.NewScanner(bytes.NewReader(raw))
|
|
sc.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
|
|
|
|
lines := []string{}
|
|
lineSet := map[string]struct{}{}
|
|
for sc.Scan() {
|
|
line := sc.Text()
|
|
if strings.Contains(line, "SPDX-License-Identifier:") {
|
|
continue
|
|
}
|
|
norm := strings.ToLower(strings.TrimSpace(line))
|
|
if norm == "" {
|
|
continue
|
|
}
|
|
lines = append(lines, norm)
|
|
lineSet[norm] = struct{}{}
|
|
}
|
|
if err := sc.Err(); err != nil {
|
|
return "", nil, nil, nil, nil, err
|
|
}
|
|
sum := sha256.Sum256([]byte(strings.Join(lines, "\n")))
|
|
return hex.EncodeToString(sum[:]), lineSet, makeNGramSet(lines, 2), makeNGramSet(lines, 5), makeNGramSet(lines, 7), nil
|
|
}
|
|
|
|
// findBestMatch returns the upstream file most similar to local. The file at
// the identical relative path (if any) is scored first as the baseline; a
// file that merely shares the basename (capped at maxCandidates) displaces
// it only when it scores strictly higher on the composite metric.
func findBestMatch(local fileData, byRel map[string]fileData, byBase map[string][]fileData, maxCandidates int) matchResult {
	best := matchResult{}
	if direct, ok := byRel[local.relPath]; ok {
		best = similarityDetail(local, direct)
		best.upstreamRelPath = direct.relPath
		best.directPathMatch = true
	}

	// Basename candidates catch moved/renamed files. The slice comes from
	// upstream walk order; truncation caps work for very common basenames.
	candidates := byBase[local.baseName]
	if len(candidates) > maxCandidates {
		candidates = candidates[:maxCandidates]
	}
	for _, cand := range candidates {
		// Skip the entry already scored as the current best.
		if cand.relPath == best.upstreamRelPath {
			continue
		}
		d := similarityDetail(local, cand)
		if d.similarityPct > best.similarityPct {
			d.upstreamRelPath = cand.relPath
			d.directPathMatch = cand.relPath == local.relPath
			best = d
		}
	}
	// Defensive re-assertion: if the winner sits at local's own path, the
	// direct-path flag must be set regardless of which branch produced it.
	if best.upstreamRelPath == local.relPath && best.upstreamRelPath != "" {
		best.directPathMatch = true
	}
	return best
}
|
|
|
|
// similarityDetail computes the full set of similarity metrics between two
// fingerprints. Identical non-empty normalized hashes short-circuit to a
// perfect 100% result; the counts in that fast path mirror what the full
// computation would report for identical sets (note bigramTotal uses the
// Dice denominator len(a)+len(b), not a union).
func similarityDetail(a, b fileData) matchResult {
	if a.normHash != "" && a.normHash == b.normHash {
		return matchResult{
			similarityPct: 100,
			lineJaccardPct: 100,
			bigramDicePct: 100,
			shingle5Pct: 100,
			shingle7Pct: 100,
			lineCommon: len(a.lineSet),
			lineUnion: len(a.lineSet),
			bigramCommon: len(a.bigramSet),
			bigramTotal: len(a.bigramSet) + len(b.bigramSet),
			shingle5Common: len(a.shingle5Set),
			shingle5Union: len(a.shingle5Set),
			shingle7Common: len(a.shingle7Set),
			shingle7Union: len(a.shingle7Set),
		}
	}
	lineJ, lineCommon, lineUnion := setJaccardPctWithCounts(a.lineSet, b.lineSet)
	biD, biCommon, biTotal := setDicePctWithCounts(a.bigramSet, b.bigramSet)
	sh5, sh5Common, sh5Union := setJaccardPctWithCounts(a.shingle5Set, b.shingle5Set)
	sh7, sh7Common, sh7Union := setJaccardPctWithCounts(a.shingle7Set, b.shingle7Set)
	// A metric only contributes to the composite when BOTH files have data
	// for it (short files may lack 5/7-line shingles entirely).
	composite := combineComposite(lineJ, biD, sh5, sh7, len(a.bigramSet) > 0 && len(b.bigramSet) > 0, len(a.shingle5Set) > 0 && len(b.shingle5Set) > 0, len(a.shingle7Set) > 0 && len(b.shingle7Set) > 0)
	return matchResult{
		similarityPct: composite,
		lineJaccardPct: lineJ,
		bigramDicePct: biD,
		shingle5Pct: sh5,
		shingle7Pct: sh7,
		lineCommon: lineCommon,
		lineUnion: lineUnion,
		bigramCommon: biCommon,
		bigramTotal: biTotal,
		shingle5Common: sh5Common,
		shingle5Union: sh5Union,
		shingle7Common: sh7Common,
		shingle7Union: sh7Union,
	}
}
|
|
|
|
// classify maps a similarity result to an SPDX expression. The pure-0BSD
|
|
// branch requires both --allow-pure-0bsd and --pure-0bsd-confirm-no-derivation
|
|
// because textual similarity alone cannot prove non-derivation; renamed or
|
|
// heavily refactored forks of upstream files retain MIT obligations.
|
|
func classify(match matchResult, cfg config) (string, string) {
|
|
sim := match.similarityPct / 100.0
|
|
if sim >= cfg.mitThreshold {
|
|
return "MIT", fmt.Sprintf("composite similarity %.2f%% >= MIT threshold %.2f%%", match.similarityPct, cfg.mitThreshold*100.0)
|
|
}
|
|
if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation && !match.directPathMatch && match.similarityPct <= cfg.pure0BSDThreshold {
|
|
return "0BSD", fmt.Sprintf("low similarity %.2f%% <= pure-0bsd threshold %.2f%% and no direct path match (operator confirmed non-derivation)", match.similarityPct, cfg.pure0BSDThreshold)
|
|
}
|
|
if sim >= cfg.mixedThreshold {
|
|
return "0BSD AND MIT", fmt.Sprintf("composite similarity %.2f%% >= mixed threshold %.2f%%", match.similarityPct, cfg.mixedThreshold*100.0)
|
|
}
|
|
return cfg.baseLicense, fmt.Sprintf("below mixed threshold; fallback to base license %s", cfg.baseLicense)
|
|
}
|
|
|
|
func makeNGramSet(lines []string, n int) map[uint64]struct{} {
|
|
out := map[uint64]struct{}{}
|
|
if n <= 0 || len(lines) < n {
|
|
return out
|
|
}
|
|
for i := 0; i <= len(lines)-n; i++ {
|
|
out[hashNGram(lines[i:i+n])] = struct{}{}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// hashNGram folds parts into a single FNV-1a 64-bit value, joining adjacent
// parts with a NUL separator so that ["ab"] and ["a","b"] hash differently.
func hashNGram(parts []string) uint64 {
	h := fnv.New64a()
	last := len(parts) - 1
	for i, part := range parts {
		_, _ = h.Write([]byte(part))
		if i != last {
			_, _ = h.Write([]byte{0})
		}
	}
	return h.Sum64()
}
|
|
|
|
func combineComposite(lineJ, biD, sh5, sh7 float64, hasBi, hasS5, hasS7 bool) float64 {
|
|
type part struct {
|
|
score float64
|
|
weight float64
|
|
ok bool
|
|
}
|
|
parts := []part{
|
|
{lineJ, 0.20, true},
|
|
{biD, 0.30, hasBi},
|
|
{sh5, 0.30, hasS5},
|
|
{sh7, 0.20, hasS7},
|
|
}
|
|
weighted := 0.0
|
|
total := 0.0
|
|
for _, p := range parts {
|
|
if !p.ok {
|
|
continue
|
|
}
|
|
weighted += p.score * p.weight
|
|
total += p.weight
|
|
}
|
|
if total <= 0 {
|
|
return 0
|
|
}
|
|
return weighted / total
|
|
}
|
|
|
|
// setJaccardPctWithCounts returns the Jaccard similarity of a and b as a
// percentage, along with the intersection and union sizes. When either set
// is empty the score is 0 and the "union" reported is len(a)+len(b).
func setJaccardPctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) {
	if len(a) == 0 || len(b) == 0 {
		return 0, 0, len(a) + len(b)
	}
	// Iterate the smaller set; intersection is symmetric.
	small, large := a, b
	if len(b) < len(a) {
		small, large = b, a
	}
	inter := 0
	for k := range small {
		if _, hit := large[k]; hit {
			inter++
		}
	}
	union := len(a) + len(b) - inter
	if union <= 0 {
		return 0, inter, union
	}
	return float64(inter) / float64(union) * 100.0, inter, union
}
|
|
|
|
// setDicePctWithCounts returns the Dice coefficient of a and b as a
// percentage, along with the intersection size and the Dice denominator
// len(a)+len(b). Either set being empty scores 0.
func setDicePctWithCounts[T comparable](a, b map[T]struct{}) (float64, int, int) {
	total := len(a) + len(b)
	if len(a) == 0 || len(b) == 0 {
		return 0, 0, total
	}
	shared := 0
	for k := range a {
		if _, hit := b[k]; hit {
			shared++
		}
	}
	if total <= 0 {
		// Unreachable when both sets are non-empty; kept as a guard.
		return 0, shared, total
	}
	return 2.0 * float64(shared) / float64(total) * 100.0, shared, total
}
|
|
|
|
// loadOverrides reads a JSON object mapping repo-relative path -> SPDX
// expression. A blank path yields an empty map. Keys are slash-normalized
// and trimmed; entries whose key or value trims to empty are dropped.
func loadOverrides(path string) (map[string]string, error) {
	overrides := map[string]string{}
	if strings.TrimSpace(path) == "" {
		return overrides, nil
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	parsed := map[string]string{}
	if err := json.Unmarshal(data, &parsed); err != nil {
		return nil, err
	}
	for rawPath, rawLicense := range parsed {
		rel := filepath.ToSlash(strings.TrimSpace(rawPath))
		lic := strings.TrimSpace(rawLicense)
		if rel == "" || lic == "" {
			continue
		}
		overrides[rel] = lic
	}
	return overrides, nil
}
|
|
|
|
// isHeaderExt reports whether path's extension (lowercased) is in exts.
func isHeaderExt(path string, exts map[string]struct{}) bool {
	_, ok := exts[strings.ToLower(filepath.Ext(path))]
	return ok
}
|
|
|
|
// removeSPDXFromNonTargetFiles walks the repo and strips the SPDX line from
// every file that passes the scan filters but is NOT in header-exts (i.e.
// files this tool does not manage headers for). Returns the number of files
// rewritten. Caution: this is the destructive half of --clean-non-target-spdx.
func removeSPDXFromNonTargetFiles(cfg config) (int, error) {
	rootAbs, _ := filepath.Abs(cfg.repoRoot)
	removed := 0
	err := filepath.WalkDir(rootAbs, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if d.IsDir() {
			// Prune excluded directories wholesale.
			if _, skip := cfg.excludedDirs[d.Name()]; skip {
				return filepath.SkipDir
			}
			return nil
		}
		rel, err := filepath.Rel(rootAbs, path)
		if err != nil {
			return err
		}
		rel = filepath.ToSlash(rel)
		if shouldSkipFile(cfg, rel, path) {
			return nil
		}
		// Header-managed files are handled by upsertSPDX, never stripped.
		if isHeaderExt(path, cfg.headerExts) {
			return nil
		}
		changed, err := removeSPDXLine(path)
		if err != nil {
			return err
		}
		if changed {
			removed++
		}
		return nil
	})
	return removed, err
}
|
|
|
|
// removeSPDXLine deletes the first line containing an SPDX marker within the
// first 25 lines of the file at path, absorbing one immediately following
// blank line so no empty gap is left behind. Returns whether the file was
// rewritten.
func removeSPDXLine(path string) (bool, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return false, err
	}
	text := string(raw)
	lines := strings.Split(text, "\n")
	// Only the top of the file is scanned; SPDX markers live in headers.
	maxScan := 25
	if len(lines) < maxScan {
		maxScan = len(lines)
	}
	idx := -1
	for i := 0; i < maxScan; i++ {
		if strings.Contains(lines[i], "SPDX-License-Identifier:") {
			idx = i
			break
		}
	}
	if idx < 0 {
		return false, nil
	}
	out := append([]string{}, lines[:idx]...)
	// Drop the blank separator line that typically follows the header.
	if idx+1 < len(lines) && strings.TrimSpace(lines[idx+1]) == "" {
		out = append(out, lines[idx+2:]...)
	} else {
		out = append(out, lines[idx+1:]...)
	}
	newText := strings.Join(out, "\n")
	if newText == text {
		return false, nil
	}
	return true, os.WriteFile(path, []byte(newText), 0o644)
}
|
|
|
|
// upsertSPDX inserts or (when replaceExisting) replaces the SPDX header line
// in the file at path, using the comment style registered for its extension.
// Only the first 25 lines are scanned for an existing marker. Returns
// whether the file changed, plus any SPDX expression that was already
// present (empty when none).
func upsertSPDX(path, spdx string, replaceExisting bool) (bool, string, error) {
	ext := strings.ToLower(filepath.Ext(path))
	style, ok := commentStyleByExt[ext]
	if !ok {
		// No known comment style: leave the file untouched.
		return false, "", nil
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		return false, "", err
	}
	text := string(raw)
	lines := strings.Split(text, "\n")
	existingIdx := -1
	existingValue := ""
	maxScan := 25
	if len(lines) < maxScan {
		maxScan = len(lines)
	}
	for i := 0; i < maxScan; i++ {
		if strings.Contains(lines[i], "SPDX-License-Identifier:") {
			existingIdx = i
			// Capture the current expression for the report, even when we
			// are not allowed to replace it.
			m := spdxPattern.FindStringSubmatch(lines[i])
			if len(m) > 1 {
				existingValue = strings.TrimSpace(m[1])
			}
			break
		}
	}
	spdxLine := formatSPDX(style, spdx)
	if existingIdx >= 0 {
		if !replaceExisting {
			// Existing headers are preserved unless --replace-existing.
			return false, existingValue, nil
		}
		lines[existingIdx] = spdxLine
	} else {
		// Insert after a shebang and, for Python, after a "# ... coding ..."
		// declaration line, followed by one blank separator line.
		insertAt := 0
		if len(lines) > 0 && strings.HasPrefix(lines[0], "#!") {
			insertAt = 1
		}
		if ext == ".py" && len(lines) > insertAt && strings.HasPrefix(lines[insertAt], "#") && strings.Contains(lines[insertAt], "coding") {
			insertAt++
		}
		prefix := append([]string{}, lines[:insertAt]...)
		suffix := append([]string{}, lines[insertAt:]...)
		lines = append(prefix, spdxLine, "")
		lines = append(lines, suffix...)
	}
	newText := strings.Join(lines, "\n")
	if newText == text {
		return false, existingValue, nil
	}
	return true, existingValue, os.WriteFile(path, []byte(newText), 0o644)
}
|
|
|
|
// formatSPDX renders an SPDX header line for the given comment style
// ("hash", "line", "block", or "html"); unknown styles fall back to the
// C++-style line comment.
func formatSPDX(style, spdx string) string {
	marker := "SPDX-License-Identifier: " + spdx
	switch style {
	case "hash":
		return "# " + marker
	case "block":
		return "/* " + marker + " */"
	case "html":
		return "<!-- " + marker + " -->"
	case "line":
		return "// " + marker
	default:
		return "// " + marker
	}
}
|
|
|
|
func formatCountSummary(counts map[string]int, total float64) string {
|
|
keys := make([]string, 0, len(counts))
|
|
for k := range counts {
|
|
keys = append(keys, k)
|
|
}
|
|
sort.Strings(keys)
|
|
parts := make([]string, 0, len(keys))
|
|
for _, k := range keys {
|
|
parts = append(parts, fmt.Sprintf("%s: %d (%.2f%%)", k, counts[k], pct(counts[k], total)))
|
|
}
|
|
return strings.Join(parts, ", ")
|
|
}
|
|
|
|
// shouldExplainPath reports whether a score explanation should be printed
// for relPath: always under --explain-all, otherwise only when the
// slash-normalized path was listed via --explain-paths.
func shouldExplainPath(relPath string, explainAll bool, explainPaths map[string]struct{}) bool {
	if explainAll {
		return true
	}
	_, requested := explainPaths[filepath.ToSlash(relPath)]
	return requested
}
|
|
|
|
// printExplanation writes a human-readable breakdown of one file's scoring
// to stdout: the matched upstream file, each metric with its raw counts, the
// active thresholds, and the final license decision with its reason.
func printExplanation(relPath string, match matchResult, license, decisionReason string, cfg config) {
	fmt.Printf("\n--- score explanation: %s ---\n", relPath)
	if match.upstreamRelPath == "" {
		fmt.Printf("upstream match: <none>\n")
	} else {
		fmt.Printf("upstream match: %s\n", match.upstreamRelPath)
	}
	fmt.Printf("direct path match: %v\n", match.directPathMatch)
	fmt.Printf("composite similarity: %.2f%%\n", match.similarityPct)
	fmt.Printf("line_jaccard: %.2f%% (%d common / %d union)\n", match.lineJaccardPct, match.lineCommon, match.lineUnion)
	fmt.Printf("bigram_dice: %.2f%% (%d common / %d total)\n", match.bigramDicePct, match.bigramCommon, match.bigramTotal)
	fmt.Printf("shingle5_jaccard: %.2f%% (%d common / %d union)\n", match.shingle5Pct, match.shingle5Common, match.shingle5Union)
	fmt.Printf("shingle7_jaccard: %.2f%% (%d common / %d union)\n", match.shingle7Pct, match.shingle7Common, match.shingle7Union)
	// mit/mixed are stored as fractions, so scale to percent for display;
	// pure0BSDThreshold is already a percent.
	fmt.Printf(
		"thresholds: mit=%.2f%% mixed=%.2f%% pure_0bsd=%.2f%% allow_pure_0bsd=%v\n",
		cfg.mitThreshold*100.0,
		cfg.mixedThreshold*100.0,
		cfg.pure0BSDThreshold,
		cfg.allowPure0BSD,
	)
	fmt.Printf("decision: %s (%s)\n", license, decisionReason)
	fmt.Printf("--- end explanation ---\n")
}
|
|
|
|
// round2 rounds v to two decimal places, rounding halves away from zero.
//
// The previous implementation, float64(int(v*100+0.5))/100, mis-rounded
// negative inputs (adding 0.5 pushes them toward positive infinity, e.g.
// -1.236 became -1.23 instead of -1.24) and silently overflowed the int
// conversion for very large magnitudes. math.Round handles both; results
// for the non-negative percentages this tool produces are unchanged.
func round2(v float64) float64 {
	return math.Round(v*100) / 100
}
|
|
|
|
// pct converts count into a percentage of total; a non-positive total
// yields 0 rather than dividing by zero.
func pct(count int, total float64) float64 {
	if total > 0 {
		return float64(count) / total * 100.0
	}
	return 0
}
|
|
|
|
// emitSafetyNotices prints stderr warnings for flag combinations that can
// silently relicense files or strip third-party license markers. The
// pure-0BSD branch in classify also requires the explicit confirm flag, so
// this function reminds the operator when --allow-pure-0bsd is set without
// it. Warnings are informational only; no flag combination is rejected here.
func emitSafetyNotices(cfg config) {
	// allow-pure-0bsd without its confirm flag is a no-op in classify.
	if cfg.allowPure0BSD && !cfg.pure0BSDConfirmNoDerivation {
		fmt.Fprintln(os.Stderr,
			"warning: --allow-pure-0bsd is set but --pure-0bsd-confirm-no-derivation is not; "+
				"pure 0BSD classification will not be applied. Textual similarity is not a legal "+
				"test for derivative-work status.")
	}
	if cfg.allowPure0BSD && cfg.pure0BSDConfirmNoDerivation {
		fmt.Fprintln(os.Stderr,
			"warning: pure-0BSD classification is enabled. Files with low similarity and no "+
				"direct path match will be labeled 0BSD. Confirm those files are not derivative "+
				"works of upstream sources before publishing.")
	}
	if cfg.cleanNonTarget {
		fmt.Fprintln(os.Stderr,
			"warning: --clean-non-target-spdx is set; SPDX headers in files outside header-exts "+
				"will be removed. Audit the diff before committing to avoid stripping third-party "+
				"license markers in vendored or generated files.")
	}
	if cfg.replaceExisting {
		fmt.Fprintln(os.Stderr,
			"warning: --replace-existing is set; existing SPDX headers will be overwritten by "+
				"the classifier output. Review the report-json before committing.")
	}
}
|
|
|
|
// exitErr reports err on stderr in the tool's "Error: ..." format and
// terminates the process with exit status 1. Note: os.Exit skips any
// pending deferred calls in the caller.
func exitErr(err error) {
	fmt.Fprintf(os.Stderr, "Error: %v\n", err)
	os.Exit(1)
}
|
|
|
|
// init pins GOMAXPROCS to the machine's CPU count.
// NOTE(review): this has been the runtime default since Go 1.5, so the call
// is effectively a no-op on modern toolchains — confirm it can be removed.
func init() {
	_ = runtime.GOMAXPROCS(runtime.NumCPU())
}
|