# Builds a zip archive of a dataset release and streams it to S3.
# Triggered cross-repo via repository_dispatch with a client_payload of
# { dataset_id, version }; the checkout ref is the matching "v<version>" tag.
name: Generate Archive

on:
  repository_dispatch:
    types: [generate-archive]

jobs:
  archive:
    name: Generate Dataset Archive
    runs-on: ubuntu-latest
    env:
      DATASET_ID: ${{ github.event.client_payload.dataset_id }}
      VERSION: ${{ github.event.client_payload.version }}
    steps:
      # Fail fast with a readable annotation if the dispatch payload is incomplete.
      - name: Validate inputs
        run: |
          if [ -z "$DATASET_ID" ]; then
            echo "::error::Missing dataset_id in client_payload"
            exit 1
          fi
          if [ -z "$VERSION" ]; then
            echo "::error::Missing version in client_payload"
            exit 1
          fi

      - uses: actions/checkout@v4
        with:
          ref: v${{ github.event.client_payload.version }}

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Install the streaming deps outside the repo so the checkout stays pristine;
      # the run step exposes them via NODE_PATH.
      - name: Install streaming dependencies
        run: |
          mkdir -p /tmp/archive-deps
          cd /tmp/archive-deps
          npm init -y > /dev/null
          npm install --no-save archiver @aws-sdk/client-s3 @aws-sdk/lib-storage

      # Quoted heredoc delimiter: no shell interpolation inside the script;
      # it reads its configuration from process.env at runtime.
      - name: Write archive script
        run: |
          cat > /tmp/stream-archive.js << 'ARCHIVE_SCRIPT'
| 44 | + var fs = require("fs"); |
| 45 | + var path = require("path"); |
| 46 | + var S3Client = require("@aws-sdk/client-s3").S3Client; |
| 47 | + var Upload = require("@aws-sdk/lib-storage").Upload; |
| 48 | + var archiver = require("archiver"); |
| 49 | + var PassThrough = require("stream").PassThrough; |
| 50 | + var https = require("https"); |
| 51 | + var http = require("http"); |
| 52 | +
|
| 53 | + var DATASET_ID = process.env.DATASET_ID; |
| 54 | + var VERSION = process.env.VERSION; |
| 55 | + var BUCKET = "nemar"; |
| 56 | + var REGION = process.env.AWS_DEFAULT_REGION || "us-east-2"; |
| 57 | + var S3_BASE = "https://" + BUCKET + ".s3." + REGION + ".amazonaws.com"; |
| 58 | +
|
| 59 | + function resolveAnnexKey(filePath) { |
| 60 | + try { |
| 61 | + var stat = fs.lstatSync(filePath); |
| 62 | + if (stat.isSymbolicLink()) { |
| 63 | + var target = fs.readlinkSync(filePath); |
| 64 | + var m = target.match(/([^\/]+)\/\1$/); |
| 65 | + if (m) return m[1]; |
| 66 | + var m2 = target.match(/\/annex\/objects\/(.+)$/); |
| 67 | + if (m2) return m2[1]; |
| 68 | + } else if (stat.isFile() && stat.size < 500 && stat.size > 20) { |
| 69 | + var content = fs.readFileSync(filePath, "utf8").trim(); |
| 70 | + var m3 = content.match(/^\/annex\/objects\/(.+)$/); |
| 71 | + if (m3) return m3[1]; |
| 72 | + } |
| 73 | + } catch (e) { |
| 74 | + console.warn(" resolveAnnexKey failed for " + filePath + ": " + e.message); |
| 75 | + } |
| 76 | + return null; |
| 77 | + } |
| 78 | +
|
| 79 | + function fetchUrl(url) { |
| 80 | + return new Promise(function (resolve, reject) { |
| 81 | + var mod = url.indexOf("https") === 0 ? https : http; |
| 82 | + mod |
| 83 | + .get(url, function (res) { |
| 84 | + if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { |
| 85 | + fetchUrl(res.headers.location).then(resolve).catch(reject); |
| 86 | + return; |
| 87 | + } |
| 88 | + if (res.statusCode !== 200) { |
| 89 | + res.resume(); |
| 90 | + reject(new Error("HTTP " + res.statusCode + " for " + url)); |
| 91 | + return; |
| 92 | + } |
| 93 | + resolve(res); |
| 94 | + }) |
| 95 | + .on("error", reject); |
| 96 | + }); |
| 97 | + } |
| 98 | +
|
| 99 | + function walkDir(dir, base) { |
| 100 | + base = base || ""; |
| 101 | + var result = []; |
| 102 | + var entries = fs.readdirSync(dir, { withFileTypes: true }); |
| 103 | + for (var i = 0; i < entries.length; i++) { |
| 104 | + var entry = entries[i]; |
| 105 | + if (entry.name === ".git" || entry.name === ".github" || entry.name === "node_modules") continue; |
| 106 | + var rel = base ? base + "/" + entry.name : entry.name; |
| 107 | + var full = path.join(dir, entry.name); |
| 108 | + if (entry.isDirectory()) { |
| 109 | + result = result.concat(walkDir(full, rel)); |
| 110 | + } else { |
| 111 | + result.push({ rel: rel, full: full }); |
| 112 | + } |
| 113 | + } |
| 114 | + return result; |
| 115 | + } |
| 116 | +
|
| 117 | + async function main() { |
| 118 | + console.log("Streaming archive for " + DATASET_ID + " v" + VERSION); |
| 119 | +
|
| 120 | + var archive = archiver("zip", { zlib: { level: 1 } }); |
| 121 | + var passThrough = new PassThrough(); |
| 122 | + archive.pipe(passThrough); |
| 123 | +
|
| 124 | + archive.on("warning", function (err) { |
| 125 | + console.warn("Archive warning:", err.message); |
| 126 | + }); |
| 127 | + archive.on("error", function (err) { |
| 128 | + console.error("Archive error:", err.message); |
| 129 | + process.exitCode = 1; |
| 130 | + }); |
| 131 | + passThrough.on("error", function (err) { |
| 132 | + console.error("Stream error:", err.message); |
| 133 | + process.exitCode = 1; |
| 134 | + }); |
| 135 | +
|
| 136 | + var s3 = new S3Client({ region: REGION }); |
| 137 | + var s3Key = DATASET_ID + "/archives/v" + VERSION + ".zip"; |
| 138 | +
|
| 139 | + var upload = new Upload({ |
| 140 | + client: s3, |
| 141 | + params: { |
| 142 | + Bucket: BUCKET, |
| 143 | + Key: s3Key, |
| 144 | + Body: passThrough, |
| 145 | + ContentType: "application/zip", |
| 146 | + }, |
| 147 | + queueSize: 4, |
| 148 | + partSize: 100 * 1024 * 1024, |
| 149 | + }); |
| 150 | +
|
| 151 | + var uploadDone = upload.done().catch(function (err) { |
| 152 | + console.error("S3 Upload error:", err.message); |
| 153 | + process.exitCode = 1; |
| 154 | + throw err; |
| 155 | + }); |
| 156 | +
|
| 157 | + var files = walkDir("."); |
| 158 | + console.log("Found " + files.length + " files"); |
| 159 | +
|
| 160 | + var annexed = 0; |
| 161 | + var regular = 0; |
| 162 | + var skipped = 0; |
| 163 | +
|
| 164 | + for (var i = 0; i < files.length; i++) { |
| 165 | + var rel = files[i].rel; |
| 166 | + var full = files[i].full; |
| 167 | + var annexKey = resolveAnnexKey(full); |
| 168 | +
|
| 169 | + try { |
| 170 | + var entryDone = new Promise(function (resolve, reject) { |
| 171 | + archive.once("entry", resolve); |
| 172 | + archive.once("error", reject); |
| 173 | + }); |
| 174 | + if (annexKey) { |
| 175 | + var url = S3_BASE + "/" + DATASET_ID + "/objects/" + encodeURIComponent(annexKey); |
| 176 | + var stream = await fetchUrl(url); |
| 177 | + archive.append(stream, { name: rel }); |
| 178 | + } else { |
| 179 | + archive.append(fs.createReadStream(full), { name: rel }); |
| 180 | + } |
| 181 | + await entryDone; |
| 182 | + if (annexKey) annexed++; |
| 183 | + else regular++; |
| 184 | + } catch (err) { |
| 185 | + skipped++; |
| 186 | + if (skipped <= 10) { |
| 187 | + console.warn(" Skipping " + rel + ": " + err.message); |
| 188 | + } else if (skipped === 11) { |
| 189 | + console.warn(" (suppressing further skip warnings)"); |
| 190 | + } |
| 191 | + } |
| 192 | +
|
| 193 | + if ((annexed + regular + skipped) % 100 === 0) { |
| 194 | + console.log(" Progress: " + (annexed + regular + skipped) + "/" + files.length); |
| 195 | + } |
| 196 | + } |
| 197 | +
|
| 198 | + await archive.finalize(); |
| 199 | + await uploadDone; |
| 200 | +
|
| 201 | + console.log("Archive complete: " + annexed + " annexed + " + regular + " regular + " + skipped + " skipped"); |
| 202 | + console.log("Uploaded to s3://" + BUCKET + "/" + s3Key); |
| 203 | + if (skipped > 0) { |
| 204 | + console.warn("WARNING: " + skipped + " annexed files were not found in S3"); |
| 205 | + } |
| 206 | + } |
| 207 | +
|
| 208 | + process.on("unhandledRejection", function (err) { |
| 209 | + console.error("Unhandled rejection:", err); |
| 210 | + process.exitCode = 1; |
| 211 | + }); |
| 212 | +
|
| 213 | + main().catch(function (err) { |
| 214 | + console.error("Fatal:", err); |
| 215 | + process.exitCode = 1; |
| 216 | + }); |
| 217 | + ARCHIVE_SCRIPT |
| 218 | +
|
| 219 | + - name: Stream archive to S3 |
| 220 | + env: |
| 221 | + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} |
| 222 | + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} |
| 223 | + AWS_DEFAULT_REGION: us-east-2 |
| 224 | + NODE_PATH: /tmp/archive-deps/node_modules |
| 225 | + run: node /tmp/stream-archive.js |