Skip to content

Commit f23984f

Browse files
committed
Add generate-archive.yml workflow
1 parent 5c2cafe commit f23984f

File tree

1 file changed

+225
-0
lines changed

1 file changed

+225
-0
lines changed
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
name: Generate Archive

# Triggered by an API call (repository_dispatch) carrying the dataset id and
# version to archive in client_payload.
on:
  repository_dispatch:
    types: [generate-archive]

jobs:
  archive:
    name: Generate Dataset Archive
    runs-on: ubuntu-latest
    env:
      DATASET_ID: ${{ github.event.client_payload.dataset_id }}
      VERSION: ${{ github.event.client_payload.version }}
    steps:
      # Fail fast with a clear annotation if the dispatch payload is incomplete.
      - name: Validate inputs
        run: |
          if [ -z "$DATASET_ID" ]; then
            echo "::error::Missing dataset_id in client_payload"
            exit 1
          fi
          if [ -z "$VERSION" ]; then
            echo "::error::Missing version in client_payload"
            exit 1
          fi

      # Check out the tag matching the requested dataset version.
      - uses: actions/checkout@v4
        with:
          ref: v${{ github.event.client_payload.version }}

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Install the archive/upload dependencies outside the workspace so they
      # are not swept into the generated zip.
      - name: Install streaming dependencies
        run: |
          mkdir -p /tmp/archive-deps
          cd /tmp/archive-deps
          npm init -y > /dev/null
          npm install --no-save archiver @aws-sdk/client-s3 @aws-sdk/lib-storage

      # Quoted heredoc delimiter ('ARCHIVE_SCRIPT') prevents shell expansion
      # inside the embedded JavaScript.
      - name: Write archive script
        run: |
          cat > /tmp/stream-archive.js << 'ARCHIVE_SCRIPT'
44+
// Dependencies resolved via NODE_PATH=/tmp/archive-deps/node_modules.
const fs = require("fs");
const path = require("path");
const { S3Client } = require("@aws-sdk/client-s3");
const { Upload } = require("@aws-sdk/lib-storage");
const archiver = require("archiver");
const { PassThrough } = require("stream");
const https = require("https");
const http = require("http");

// Dataset coordinates arrive through the workflow's job-level env, which is
// populated from the repository_dispatch client_payload.
const DATASET_ID = process.env.DATASET_ID;
const VERSION = process.env.VERSION;
const BUCKET = "nemar";
const REGION = process.env.AWS_DEFAULT_REGION || "us-east-2";
const S3_BASE = "https://" + BUCKET + ".s3." + REGION + ".amazonaws.com";
59+
// Detect whether `filePath` is a git-annex placeholder and return the annex
// object key it references; returns null for ordinary files or on any error.
function resolveAnnexKey(filePath) {
  try {
    const stat = fs.lstatSync(filePath);
    if (stat.isSymbolicLink()) {
      // Locked annexed files are symlinks whose target ends ".../<KEY>/<KEY>".
      const target = fs.readlinkSync(filePath);
      const dup = target.match(/([^\/]+)\/\1$/);
      if (dup) return dup[1];
      // Fallback: anything under an annex/objects directory.
      const tail = target.match(/\/annex\/objects\/(.+)$/);
      if (tail) return tail[1];
    } else if (stat.isFile() && stat.size < 500 && stat.size > 20) {
      // Unlocked pointer files are small text files containing a single
      // "/annex/objects/<KEY>" line.
      const body = fs.readFileSync(filePath, "utf8").trim();
      const ptr = body.match(/^\/annex\/objects\/(.+)$/);
      if (ptr) return ptr[1];
    }
  } catch (err) {
    console.warn(" resolveAnnexKey failed for " + filePath + ": " + err.message);
  }
  return null;
}
78+
79+
// GET `url` and resolve with the response stream (http.IncomingMessage).
// Follows 3xx redirects; rejects on network errors or a terminal non-200
// status.
function fetchUrl(url) {
  return new Promise(function (resolve, reject) {
    const mod = url.indexOf("https") === 0 ? https : http;
    mod
      .get(url, function (res) {
        if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
          // Fix: a Location header may be relative (RFC 9110 §10.2.2).
          // Resolve it against the current URL before following; the original
          // passed the raw header to http.get, which throws on relative paths.
          const next = new URL(res.headers.location, url).toString();
          fetchUrl(next).then(resolve).catch(reject);
          return;
        }
        if (res.statusCode !== 200) {
          res.resume(); // drain the body so the socket can be reused
          reject(new Error("HTTP " + res.statusCode + " for " + url));
          return;
        }
        resolve(res);
      })
      .on("error", reject);
  });
}
98+
99+
// Recursively list every file under `dir`, skipping VCS/CI/dependency
// directories. Each entry pairs the archive-relative name (always
// "/"-separated) with the on-disk path.
function walkDir(dir, base) {
  const prefix = base || "";
  const found = [];
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    if (entry.name === ".git" || entry.name === ".github" || entry.name === "node_modules") continue;
    const rel = prefix ? prefix + "/" + entry.name : entry.name;
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      for (const child of walkDir(full, rel)) {
        found.push(child);
      }
    } else {
      found.push({ rel: rel, full: full });
    }
  }
  return found;
}
116+
117+
// Stream a zip of the checked-out dataset straight to S3 without ever
// materializing the archive on disk. Annexed files are fetched from the
// dataset's S3 object store; regular files are read from the workspace.
async function main() {
  console.log("Streaming archive for " + DATASET_ID + " v" + VERSION);

  // Low compression (level 1): throughput over ratio.
  const archive = archiver("zip", { zlib: { level: 1 } });
  const passThrough = new PassThrough();
  archive.pipe(passThrough);

  archive.on("warning", function (err) {
    console.warn("Archive warning:", err.message);
  });
  archive.on("error", function (err) {
    console.error("Archive error:", err.message);
    process.exitCode = 1;
  });
  passThrough.on("error", function (err) {
    console.error("Stream error:", err.message);
    process.exitCode = 1;
  });

  const s3 = new S3Client({ region: REGION });
  const s3Key = DATASET_ID + "/archives/v" + VERSION + ".zip";

  // Multipart streaming upload fed by the archiver's output stream.
  const upload = new Upload({
    client: s3,
    params: {
      Bucket: BUCKET,
      Key: s3Key,
      Body: passThrough,
      ContentType: "application/zip",
    },
    queueSize: 4,
    partSize: 100 * 1024 * 1024,
  });

  // Start the upload immediately; attach a catch so a failure is reported
  // even while we are still appending entries.
  const uploadDone = upload.done().catch(function (err) {
    console.error("S3 Upload error:", err.message);
    process.exitCode = 1;
    throw err;
  });

  const files = walkDir(".");
  console.log("Found " + files.length + " files");

  let annexed = 0;
  let regular = 0;
  let skipped = 0;

  for (let i = 0; i < files.length; i++) {
    const rel = files[i].rel;
    const full = files[i].full;
    const annexKey = resolveAnnexKey(full);

    // Wait for the archiver to consume each entry before appending the next,
    // so at most one source stream is open at a time.
    let onEntry;
    let onError;
    const entryDone = new Promise(function (resolve, reject) {
      onEntry = resolve;
      onError = reject;
      archive.once("entry", onEntry);
      archive.once("error", onError);
    });
    try {
      if (annexKey) {
        const url = S3_BASE + "/" + DATASET_ID + "/objects/" + encodeURIComponent(annexKey);
        const stream = await fetchUrl(url);
        archive.append(stream, { name: rel });
      } else {
        archive.append(fs.createReadStream(full), { name: rel });
      }
      await entryDone;
      if (annexKey) annexed++;
      else regular++;
    } catch (err) {
      // Best-effort: a missing/unfetchable file skips rather than aborting.
      skipped++;
      if (skipped <= 10) {
        console.warn(" Skipping " + rel + ": " + err.message);
      } else if (skipped === 11) {
        console.warn(" (suppressing further skip warnings)");
      }
    } finally {
      // Fix: the original leaked one once("error") listener per file (and the
      // "entry" listener on the fetch-failure path), so listeners grew
      // unboundedly and Node emitted MaxListenersExceededWarning past ~10
      // files. Detach both once the entry has settled.
      archive.removeListener("entry", onEntry);
      archive.removeListener("error", onError);
    }

    if ((annexed + regular + skipped) % 100 === 0) {
      console.log(" Progress: " + (annexed + regular + skipped) + "/" + files.length);
    }
  }

  // Close the zip, then wait for the multipart upload to flush completely.
  await archive.finalize();
  await uploadDone;

  console.log("Archive complete: " + annexed + " annexed + " + regular + " regular + " + skipped + " skipped");
  console.log("Uploaded to s3://" + BUCKET + "/" + s3Key);
  if (skipped > 0) {
    console.warn("WARNING: " + skipped + " annexed files were not found in S3");
  }
}
207+
208+
// Surface any stray rejection (e.g. from the background upload promise) as a
// job failure instead of a silent warning.
process.on("unhandledRejection", (err) => {
  console.error("Unhandled rejection:", err);
  process.exitCode = 1;
});

main().catch((err) => {
  console.error("Fatal:", err);
  process.exitCode = 1;
});
217+
ARCHIVE_SCRIPT
218+
219+
- name: Stream archive to S3
220+
env:
221+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
222+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
223+
AWS_DEFAULT_REGION: us-east-2
224+
NODE_PATH: /tmp/archive-deps/node_modules
225+
run: node /tmp/stream-archive.js

0 commit comments

Comments
 (0)