Skip to content

Commit af11f1c

Browse files
committed
feat: Add dynamic package fetching from registries
Adds automatic discovery of crypto packages from npm, PyPI, the Go proxy, and Maven Central. The database now grows from ~70 curated packages to 1000+ packages through intelligent inference.

Features:
- Registry clients for npm, PyPI, Go, Maven
- Algorithm inference engine (infers RSA, AES, etc. from package names)
- Confidence levels (verified, high, medium, low)
- Curated data always takes priority over inferred

Usage:
  go run cmd/gendb/main.go          # curated only (69 packages)
  go run cmd/gendb/main.go --fetch  # curated + inferred (1000+ packages)

The weekly workflow now uses --fetch to automatically discover new crypto packages and expand the database coverage.
1 parent f548caf commit af11f1c

File tree

10 files changed

+1476
-13
lines changed

10 files changed

+1476
-13
lines changed

.github/workflows/update-database.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,10 @@ jobs:
5151
5252
- name: Generate updated database
5353
run: |
54-
go run cmd/gendb/main.go > data/crypto-database.json
54+
go run cmd/gendb/main.go --fetch --timeout=2m > data/crypto-database.json
5555
echo "Generated database with $(jq '.packages | length' data/crypto-database.json) packages"
56+
echo " Verified: $(jq '.stats.verifiedPackages' data/crypto-database.json)"
57+
echo " Inferred: $(jq '.stats.inferredPackages' data/crypto-database.json)"
5658
5759
- name: Check for changes
5860
id: changes

cmd/gendb/main.go

Lines changed: 112 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,64 @@
11
// Copyright 2024-2025 CSNP (csnp.org)
22
// SPDX-License-Identifier: Apache-2.0
33

4-
// Command gendb generates a JSON database file from the embedded database.
5-
// This is used to create the remote database that users can download.
4+
// Command gendb generates a JSON database file from the embedded database
5+
// and optionally fetches additional packages from package registries.
66
//
7-
// Usage: go run cmd/gendb/main.go > data/crypto-database.json
7+
// Usage:
8+
//
9+
// go run cmd/gendb/main.go > data/crypto-database.json # curated only
10+
// go run cmd/gendb/main.go --fetch > data/crypto-database.json # curated + registry
811
package main
912

1013
import (
14+
"context"
1115
"encoding/json"
16+
"flag"
1217
"fmt"
1318
"os"
1419
"sort"
1520
"time"
1621

1722
"github.com/csnp/qramm-cryptodeps/internal/database"
23+
"github.com/csnp/qramm-cryptodeps/internal/registry"
1824
"github.com/csnp/qramm-cryptodeps/pkg/types"
1925
)
2026

2127
// DatabaseExport is the format for the exported database.
2228
type DatabaseExport struct {
2329
Version string `json:"version"`
2430
UpdatedAt string `json:"updatedAt"`
31+
Stats DatabaseStats `json:"stats"`
2532
Packages []types.PackageAnalysis `json:"packages"`
2633
}
2734

35+
// DatabaseStats contains statistics about the database.
36+
type DatabaseStats struct {
37+
TotalPackages int `json:"totalPackages"`
38+
VerifiedPackages int `json:"verifiedPackages"`
39+
InferredPackages int `json:"inferredPackages"`
40+
ByEcosystem map[types.Ecosystem]int `json:"byEcosystem"`
41+
ByConfidence map[types.Confidence]int `json:"byConfidence"`
42+
}
43+
44+
// fetchFlag enables fetching additional packages from registries; without it
// only the curated database is exported.
var fetchFlag = flag.Bool("fetch", false, "Fetch additional packages from registries")

// timeoutFlag bounds the total time spent on registry fetches.
var timeoutFlag = flag.Duration("timeout", 2*time.Minute, "Timeout for registry fetches")
48+
2849
func main() {
29-
// Load the embedded database
30-
db := database.NewEmbedded()
50+
flag.Parse()
3151

32-
// Export all packages
33-
packages := db.ExportAll()
52+
var packages []types.PackageAnalysis
53+
var stats DatabaseStats
54+
55+
if *fetchFlag {
56+
// Fetch from registries and merge with curated data
57+
packages, stats = fetchAndMerge()
58+
} else {
59+
// Curated only
60+
packages, stats = curatedOnly()
61+
}
3462

3563
// Sort by ecosystem then package name for consistent output
3664
sort.Slice(packages, func(i, j int) bool {
@@ -41,17 +69,20 @@ func main() {
4169
})
4270

4371
export := DatabaseExport{
44-
Version: "1.0.0",
72+
Version: "1.1.0",
4573
UpdatedAt: time.Now().UTC().Format(time.RFC3339),
74+
Stats: stats,
4675
Packages: packages,
4776
}
4877

4978
// Stats to stderr
50-
stats := db.Stats()
51-
fmt.Fprintf(os.Stderr, "Exporting CryptoDeps database:\n")
79+
fmt.Fprintf(os.Stderr, "CryptoDeps Database Export:\n")
5280
fmt.Fprintf(os.Stderr, " Total packages: %d\n", stats.TotalPackages)
53-
for ecosystem, count := range stats.ByEcosystem {
54-
fmt.Fprintf(os.Stderr, " %s: %d\n", ecosystem, count)
81+
fmt.Fprintf(os.Stderr, " Verified: %d\n", stats.VerifiedPackages)
82+
fmt.Fprintf(os.Stderr, " Inferred: %d\n", stats.InferredPackages)
83+
fmt.Fprintf(os.Stderr, " By ecosystem:\n")
84+
for eco, count := range stats.ByEcosystem {
85+
fmt.Fprintf(os.Stderr, " %s: %d\n", eco, count)
5586
}
5687

5788
// Marshal and output to stdout
@@ -63,3 +94,72 @@ func main() {
6394

6495
fmt.Println(string(output))
6596
}
97+
98+
func curatedOnly() ([]types.PackageAnalysis, DatabaseStats) {
99+
db := database.NewEmbedded()
100+
packages := db.ExportAll()
101+
102+
// Mark all as verified
103+
for i := range packages {
104+
for j := range packages[i].Crypto {
105+
packages[i].Crypto[j].Confidence = types.ConfidenceVerified
106+
}
107+
}
108+
109+
stats := DatabaseStats{
110+
TotalPackages: len(packages),
111+
VerifiedPackages: len(packages),
112+
InferredPackages: 0,
113+
ByEcosystem: make(map[types.Ecosystem]int),
114+
ByConfidence: make(map[types.Confidence]int),
115+
}
116+
117+
for _, pkg := range packages {
118+
stats.ByEcosystem[pkg.Ecosystem]++
119+
}
120+
stats.ByConfidence[types.ConfidenceVerified] = len(packages)
121+
122+
return packages, stats
123+
}
124+
125+
func fetchAndMerge() ([]types.PackageAnalysis, DatabaseStats) {
126+
ctx, cancel := context.WithTimeout(context.Background(), *timeoutFlag)
127+
defer cancel()
128+
129+
fmt.Fprintf(os.Stderr, "Fetching packages from registries (timeout: %s)...\n", *timeoutFlag)
130+
131+
merger := registry.NewMerger()
132+
result, err := merger.Merge(ctx)
133+
if err != nil {
134+
fmt.Fprintf(os.Stderr, "Warning: merge error: %v\n", err)
135+
}
136+
137+
// Log any registry errors
138+
for _, e := range result.Errors {
139+
fmt.Fprintf(os.Stderr, "Warning: %v\n", e)
140+
}
141+
142+
stats := DatabaseStats{
143+
TotalPackages: result.Stats.TotalPackages,
144+
VerifiedPackages: result.Stats.CuratedPackages,
145+
InferredPackages: result.Stats.InferredPackages,
146+
ByEcosystem: make(map[types.Ecosystem]int),
147+
ByConfidence: make(map[types.Confidence]int),
148+
}
149+
150+
// Count by ecosystem
151+
for eco, count := range result.Stats.ByEcosystem {
152+
stats.ByEcosystem[eco] = count
153+
}
154+
155+
// Count by confidence
156+
for _, pkg := range result.Packages {
157+
for _, crypto := range pkg.Crypto {
158+
stats.ByConfidence[crypto.Confidence]++
159+
}
160+
}
161+
162+
fmt.Fprintf(os.Stderr, "Fetched %d new packages from registries\n", result.Stats.NewPackages)
163+
164+
return result.Packages, stats
165+
}

internal/registry/fetcher.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
// Copyright 2024-2025 CSNP (csnp.org)
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Package registry fetches crypto package information from package registries.
5+
package registry
6+
7+
import (
8+
"context"
9+
"fmt"
10+
"sync"
11+
"time"
12+
13+
"github.com/csnp/qramm-cryptodeps/pkg/types"
14+
)
15+
16+
// PackageInfo contains information about a package from a registry.
17+
type PackageInfo struct {
18+
Name string
19+
Version string
20+
Description string
21+
Ecosystem types.Ecosystem
22+
Downloads int64
23+
Repository string
24+
Keywords []string
25+
License string
26+
UpdatedAt time.Time
27+
}
28+
29+
// Client defines the interface for registry clients.
30+
type Client interface {
31+
// Name returns the registry name.
32+
Name() string
33+
// Ecosystem returns the ecosystem this client handles.
34+
Ecosystem() types.Ecosystem
35+
// SearchCrypto searches for crypto-related packages.
36+
SearchCrypto(ctx context.Context) ([]PackageInfo, error)
37+
}
38+
39+
// Fetcher aggregates results from multiple registry clients.
40+
type Fetcher struct {
41+
clients []Client
42+
timeout time.Duration
43+
}
44+
45+
// NewFetcher creates a new registry fetcher with all supported clients.
46+
func NewFetcher() *Fetcher {
47+
return &Fetcher{
48+
clients: []Client{
49+
NewNPMClient(),
50+
NewPyPIClient(),
51+
NewGoClient(),
52+
NewMavenClient(),
53+
},
54+
timeout: 60 * time.Second,
55+
}
56+
}
57+
58+
// FetchResult contains results from a single registry.
59+
type FetchResult struct {
60+
Ecosystem types.Ecosystem
61+
Packages []PackageInfo
62+
Error error
63+
}
64+
65+
// FetchAll fetches crypto packages from all registries concurrently.
66+
func (f *Fetcher) FetchAll(ctx context.Context) ([]PackageInfo, []error) {
67+
ctx, cancel := context.WithTimeout(ctx, f.timeout)
68+
defer cancel()
69+
70+
var wg sync.WaitGroup
71+
results := make(chan FetchResult, len(f.clients))
72+
73+
for _, client := range f.clients {
74+
wg.Add(1)
75+
go func(c Client) {
76+
defer wg.Done()
77+
packages, err := c.SearchCrypto(ctx)
78+
results <- FetchResult{
79+
Ecosystem: c.Ecosystem(),
80+
Packages: packages,
81+
Error: err,
82+
}
83+
}(client)
84+
}
85+
86+
go func() {
87+
wg.Wait()
88+
close(results)
89+
}()
90+
91+
var allPackages []PackageInfo
92+
var errors []error
93+
94+
for result := range results {
95+
if result.Error != nil {
96+
errors = append(errors, fmt.Errorf("%s: %w", result.Ecosystem, result.Error))
97+
continue
98+
}
99+
allPackages = append(allPackages, result.Packages...)
100+
}
101+
102+
return allPackages, errors
103+
}
104+
105+
// cryptoKeywords lists the lowercase terms whose presence marks a package as
// crypto-related.
var cryptoKeywords = []string{
	// General terms.
	"crypto", "cryptography", "encryption", "decrypt",
	// Ciphers and signature algorithms.
	"cipher", "aes", "rsa", "ecdsa", "ed25519",
	// Hashing, MACs, and key derivation.
	"sha256", "sha512", "hash", "hmac", "pbkdf",
	// JSON token formats.
	"jwt", "jws", "jwe", "jose",
	// Transport security and certificates.
	"tls", "ssl", "certificate", "x509",
	// Password hashing.
	"bcrypt", "scrypt", "argon2",
	// PGP family.
	"pgp", "gpg", "openpgp",
	// NaCl family.
	"nacl", "sodium", "curve25519",
	// Key exchange.
	"ecdh", "diffie-hellman", "key-exchange",
	// Signing.
	"signature", "signing", "verify",
	// Encodings and containers.
	"pem", "pkcs", "asn1",
	// Randomness.
	"random", "prng", "csprng",
	// Post-quantum.
	"post-quantum", "pqc", "kyber", "dilithium",
}
121+
122+
// isCryptoRelated checks if a package appears to be crypto-related.
123+
func isCryptoRelated(name, description string, keywords []string) bool {
124+
// Check package name
125+
nameLower := toLowerCase(name)
126+
for _, kw := range cryptoKeywords {
127+
if containsWord(nameLower, kw) {
128+
return true
129+
}
130+
}
131+
132+
// Check description
133+
descLower := toLowerCase(description)
134+
for _, kw := range cryptoKeywords {
135+
if containsWord(descLower, kw) {
136+
return true
137+
}
138+
}
139+
140+
// Check keywords
141+
for _, keyword := range keywords {
142+
kwLower := toLowerCase(keyword)
143+
for _, cryptoKw := range cryptoKeywords {
144+
if kwLower == cryptoKw || containsWord(kwLower, cryptoKw) {
145+
return true
146+
}
147+
}
148+
}
149+
150+
return false
151+
}
152+
153+
// toLowerCase returns s with the ASCII letters 'A'-'Z' converted to lower
// case. All other bytes, including those of multi-byte UTF-8 sequences, are
// copied unchanged.
func toLowerCase(s string) string {
	lowered := []byte(s)
	for idx, ch := range lowered {
		if 'A' <= ch && ch <= 'Z' {
			lowered[idx] = ch + ('a' - 'A')
		}
	}
	return string(lowered)
}
164+
165+
func containsWord(s, word string) bool {
166+
if len(word) > len(s) {
167+
return false
168+
}
169+
for i := 0; i <= len(s)-len(word); i++ {
170+
if s[i:i+len(word)] == word {
171+
// Check word boundaries
172+
before := i == 0 || !isAlphaNum(s[i-1])
173+
after := i+len(word) == len(s) || !isAlphaNum(s[i+len(word)])
174+
if before && after {
175+
return true
176+
}
177+
// Also match as substring for compound words
178+
if i > 0 && s[i-1] == '-' {
179+
return true
180+
}
181+
if i+len(word) < len(s) && s[i+len(word)] == '-' {
182+
return true
183+
}
184+
}
185+
}
186+
// Fallback: simple substring match for short keywords
187+
if len(word) >= 4 {
188+
for i := 0; i <= len(s)-len(word); i++ {
189+
if s[i:i+len(word)] == word {
190+
return true
191+
}
192+
}
193+
}
194+
return false
195+
}
196+
197+
// isAlphaNum reports whether c is an ASCII letter or an ASCII digit.
func isAlphaNum(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return true
	default:
		return false
	}
}

0 commit comments

Comments
 (0)