Skip to content

Commit 729d10c

Browse files
committed
fix(ci): remove WASM size check for Workers
1 parent f99dc30 commit 729d10c

File tree

203 files changed

+3689
-250
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

203 files changed

+3689
-250
lines changed

.github/workflows/ci-wasm.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -408,9 +408,6 @@ jobs:
408408
- name: Build WASM (all targets for E2E)
409409
run: task wasm:build:all
410410

411-
- name: Verify WASM size for Workers
412-
run: scripts/ci/wasm/verify-workers-wasm-size.sh
413-
414411
- name: Generate Workers E2E tests
415412
run: task wasm:e2e:workers:generate
416413

packages/csharp/Benchmark/Program.cs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
using System.Text.Json;
33
using Kreuzberg;
44

5-
var benchConfig = new ExtractionConfig { UseCache = false };
65
var debug = Environment.GetEnvironmentVariable("KREUZBERG_BENCHMARK_DEBUG") == "true";
76
var argsSpan = args.AsSpan();
87

@@ -43,6 +42,10 @@
4342

4443
var mode = argsSpan[modeIndex].ToString();
4544

45+
var benchConfig = ocrEnabled
46+
? new ExtractionConfig { UseCache = false, Ocr = new OcrConfig { } }
47+
: new ExtractionConfig { UseCache = false };
48+
4649
if (debug)
4750
{
4851
Console.Error.WriteLine("[DEBUG] Starting C# benchmark");
@@ -88,10 +91,12 @@
8891
var result = KreuzbergClient.ExtractBytesSync(content, mimeType, benchConfig);
8992
sw.Stop();
9093

94+
var ocrUsed = result.Metadata?.Format?.Type == FormatType.Ocr;
9195
var output = new
9296
{
9397
content = result.Content,
94-
_extraction_time_ms = sw.Elapsed.TotalMilliseconds
98+
_extraction_time_ms = sw.Elapsed.TotalMilliseconds,
99+
_ocr_used = ocrUsed
95100
};
96101

97102
var json = JsonSerializer.Serialize(output);
@@ -146,10 +151,12 @@
146151
var result = KreuzbergClient.ExtractBytesSync(content, mimeType, benchConfig);
147152
sw.Stop();
148153

154+
var ocrUsed = result.Metadata?.Format?.Type == FormatType.Ocr;
149155
var output = new
150156
{
151157
content = result.Content,
152-
_extraction_time_ms = sw.Elapsed.TotalMilliseconds
158+
_extraction_time_ms = sw.Elapsed.TotalMilliseconds,
159+
_ocr_used = ocrUsed
153160
};
154161

155162
var json = JsonSerializer.Serialize(output);

scripts/ci/wasm/verify-workers-wasm-size.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.
File renamed without changes.
File renamed without changes.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Nasdaq & AMEX
2+
3+
Stocks in bold rose or fell 5% or more
4+
5+
## USA TODAY
6+
Track your investments with our continuously updated stocks. Visit us on the web at money.usatoday.com
7+
8+
| 52-week High | Low | Stock | Last | Change | 52-week High | Low | Stock | Last | Change |
9+
|---|---|---|---|---|---|---|---|---|---|
10+
| | | — A — | | | 45.71 | 32.50 | Blomar | 36.71 | -0.42 |
11+
| 9.19 | 6.89 | ABX Air n | 7.52 | -0.10 | 2.76 | 1.20 | Blomiro | 1.46 | +0.03 |
12+
| 40.29 | 12.40 | ACMoore | 13.58 | -1.57 | 9.07 | 5.13 | BioChip | -8.05 | +0.34 |
13+
| 31.38 | 13.31 | ADA — ES | 26.36 | +3.16 | 68.88 | 50.65 | Biosite | 60.06 | -2.57 |
14+
| 27.14 | 12.68 | ADC Tel rs | 23.21 | +0.13 | 212.25 | 131.03 | BiotechT | 204.66 | -0.84 |
15+
| 39.40 | 16.42 | ADCO | 27.32 | -0.73 | 8.50 | 3.40 | BioMaBit | 6.52 | -0.45 |
16+
| 16.45 | 10.37 | APE Ent s | 15.40 | -0.14 | 18.21 | 10.73 | Blackbud | 17.90 | +0.70 |
17+
| 8.37 | 4.42 | ASE Tra | 7.16 | +0.40 | 52.73 | 13.86 | BioQsd | 41.29 | +1.30 |
18+
| 19.25 | 12.75 | ASM Intl | 17.65 | -0.03 | 14.34 | 6.12 | Bodisen n | 15.43 | +0.45 |
19+
| 20.92 | 11.39 | ASML Hld | 21.24 | +0.46 | 6.21 | 1.56 | Bodum | 5.94 | +0.06 |
20+
| 27.38 | 14.39 | ASV Inc s | 26.76 | +0.14 | 11.80 | 4.99 | Borlond | 6.68 | +0.14 |
21+
| 19.87 | 10.42 | ATI Inc | 17.89 | -0.68 | 31.92 | 21.10 | BosPrv | 31.18 | -0.07 |
22+
| 33.62 | 20.53 | ATMI Inc | 29.95 | -1.29 | 18.62 | 10.01 | BtmInt | 11.53 | +0.20 |
23+
| 99.20 | 16.76 | ATP OKG | 58.40 | -0.59 | 11.24 | 7.10 | BriExo | 12.10 | -0.23 |
24+
| 4.24 | 1.99 | AVI Bio | 3.62 | -0.02 | 46.72 | 26.65 | BrightHr s | 38.90 | -0.80 |
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Nasdaq & AMEX
2+
3+
Stocks in bold rose or fell 5% or more
4+
5+
## USA TODAY
6+
Track your investments with our continuously updated stocks. Visit us on the web at money.usatoday.com
7+
8+
| 52-week High | Low | Stock | Last | Change | 52-week High | Low | Stock | Last | Change |
9+
|---|---|---|---|---|---|---|---|---|---|
10+
| | | — A — | | | 45.71 | 32.50 | Blomar | 36.71 | -0.42 |
11+
| 9.19 | 6.89 | ABX Air n | 7.52 | -0.10 | 2.76 | 1.20 | Blomiro | 1.46 | +0.03 |
12+
| 40.29 | 12.40 | ACMoore | 13.58 | -1.57 | 9.07 | 5.13 | BioChip | -8.05 | +0.34 |
13+
| 31.38 | 13.31 | ADA — ES | 26.36 | +3.16 | 68.88 | 50.65 | Biosite | 60.06 | -2.57 |
14+
| 27.14 | 12.68 | ADC Tel rs | 23.21 | +0.13 | 212.25 | 131.03 | BiotechT | 204.66 | -0.84 |
15+
| 39.40 | 16.42 | ADCO | 27.32 | -0.73 | 8.50 | 3.40 | BioMaBit | 6.52 | -0.45 |
16+
| 16.45 | 10.37 | APE Ent s | 15.40 | -0.14 | 18.21 | 10.73 | Blackbud | 17.90 | +0.70 |
17+
| 8.37 | 4.42 | ASE Tra | 7.16 | +0.40 | 52.73 | 13.86 | BioQsd | 41.29 | +1.30 |
18+
| 19.25 | 12.75 | ASM Intl | 17.65 | -0.03 | 14.34 | 6.12 | Bodisen n | 15.43 | +0.45 |
19+
| 20.92 | 11.39 | ASML Hld | 21.24 | +0.46 | 6.21 | 1.56 | Bodum | 5.94 | +0.06 |
20+
| 27.38 | 14.39 | ASV Inc s | 26.76 | +0.14 | 11.80 | 4.99 | Borlond | 6.68 | +0.14 |
21+
| 19.87 | 10.42 | ATI Inc | 17.89 | -0.68 | 31.92 | 21.10 | BosPrv | 31.18 | -0.07 |
22+
| 33.62 | 20.53 | ATMI Inc | 29.95 | -1.29 | 18.62 | 10.01 | BtmInt | 11.53 | +0.20 |
23+
| 99.20 | 16.76 | ATP OKG | 58.40 | -0.59 | 11.24 | 7.10 | BriExo | 12.10 | -0.23 |
24+
| 4.24 | 1.99 | AVI Bio | 3.62 | -0.02 | 46.72 | 26.65 | BrightHr s | 38.90 | -0.80 |
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
2+
3+
4+
5+
6+
7+
8+
9+
10+
11+
12+
13+
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+
27+
28+
29+
30+
31+
32+
33+
34+
35+
36+
37+
38+
39+
40+
41+
42+
43+
44+
45+
46+
47+
48+
49+
50+
51+
52+
53+
54+
55+
56+
57+
58+
59+
60+
61+
62+
63+
64+
65+
66+
67+
68+
69+
70+
71+
72+
73+
74+
75+
76+
77+
78+
79+
80+
81+
82+
83+
84+
85+
86+
87+
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# LayoutParser: A Unified Toolkit for DL-Based DIA
2+
3+
## Table 1: Current layout detection models in the LayoutParser model zoo
4+
5+
| Dataset | Base Model | Large Model | Notes |
6+
|---------|-----------|------------|-------|
7+
| PubLayNet [38] | P / M | M | Layouts of modern scientific documents |
8+
| Manga [5] | M | - | Layouts of scanned modern magazines and scientific reports |
9+
| Newspaper [17] | P | - | Layouts of scanned US newspapers from the 20th century |
10+
| TableBank [44] | P | - | Table region on modern scientific and business document |
11+
| HJDataset [91] | P / M | - | Layouts of history Japanese documents |
12+
13+
For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For "base model" and "large model", we refer to using the ResNet 50 or ResNet 101 backbone [15]. The past 1 architecture and mAP backbone). The platform is maintained and a number of additions will be made to the model zoo in coming months.
14+
15+
**layout data structures**, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the **OCR module**. 4) LayoutParser comes with a set of utility functions for the **visualization and storage of the layout data**. 5) LayoutParser is also highly customizable, via its integration with functions for **layout data annotation and model training**. We now provide detailed descriptions for each component.
16+
17+
## 3.1 Layout Detection Models
18+
19+
In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [36], provides a minimal API that can perform layout detection with only four lines of code in Python:
20+
21+
```
22+
1 import layoutparser as lp
23+
2 image = cv2.imread("image_file") # load image
24+
3 model = lp.Detectron2LayoutModel(
25+
4 "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config")
26+
5 layout = model.detect(image)
27+
```
28+
29+
LayoutParser provides a wealth of pre-trained model weights using various datasets covering different languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are applied to target samples that are significantly different from the training dataset. As document structures and layouts vary greatly in different domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name: lp://\<dataset-name>/\<model-architecture-name>.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Nasdaq & AMEX
2+
3+
Stocks in bold rose or fell 5% or more
4+
5+
## USA TODAY
6+
Track your investments with our continuously updated stocks. Visit us on the web at money.usatoday.com
7+
8+
| 52-week High | Low | Stock | Last | Change | 52-week High | Low | Stock | Last | Change |
9+
|---|---|---|---|---|---|---|---|---|---|
10+
| | | — A — | | | 45.71 | 32.50 | Blomar | 36.71 | -0.42 |
11+
| 9.19 | 6.89 | ABX Air n | 7.52 | -0.10 | 2.76 | 1.20 | Blomiro | 1.46 | +0.03 |
12+
| 40.29 | 12.40 | ACMoore | 13.58 | -1.57 | 9.07 | 5.13 | BioChip | -8.05 | +0.34 |
13+
| 31.38 | 13.31 | ADA — ES | 26.36 | +3.16 | 68.88 | 50.65 | Biosite | 60.06 | -2.57 |
14+
| 27.14 | 12.68 | ADC Tel rs | 23.21 | +0.13 | 212.25 | 131.03 | BiotechT | 204.66 | -0.84 |
15+
| 39.40 | 16.42 | ADCO | 27.32 | -0.73 | 8.50 | 3.40 | BioMaBit | 6.52 | -0.45 |
16+
| 16.45 | 10.37 | APE Ent s | 15.40 | -0.14 | 18.21 | 10.73 | Blackbud | 17.90 | +0.70 |
17+
| 8.37 | 4.42 | ASE Tra | 7.16 | +0.40 | 52.73 | 13.86 | BioQsd | 41.29 | +1.30 |
18+
| 19.25 | 12.75 | ASM Intl | 17.65 | -0.03 | 14.34 | 6.12 | Bodisen n | 15.43 | +0.45 |
19+
| 20.92 | 11.39 | ASML Hld | 21.24 | +0.46 | 6.21 | 1.56 | Bodum | 5.94 | +0.06 |
20+
| 27.38 | 14.39 | ASV Inc s | 26.76 | +0.14 | 11.80 | 4.99 | Borlond | 6.68 | +0.14 |
21+
| 19.87 | 10.42 | ATI Inc | 17.89 | -0.68 | 31.92 | 21.10 | BosPrv | 31.18 | -0.07 |
22+
| 33.62 | 20.53 | ATMI Inc | 29.95 | -1.29 | 18.62 | 10.01 | BtmInt | 11.53 | +0.20 |
23+
| 99.20 | 16.76 | ATP OKG | 58.40 | -0.59 | 11.24 | 7.10 | BriExo | 12.10 | -0.23 |
24+
| 4.24 | 1.99 | AVI Bio | 3.62 | -0.02 | 46.72 | 26.65 | BrightHr s | 38.90 | -0.80 |

0 commit comments

Comments
 (0)