Skip to content

Commit 48b7477

Browse files
authored
Merge pull request #24 from oiwn/dev
Updating to new minor version
2 parents 6c414db + bd7aa58 commit 48b7477

File tree

7 files changed

+130
-42
lines changed

7 files changed

+130
-42
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -103,26 +103,6 @@ jobs:
103103
with:
104104
command: "test"
105105

106-
fmt:
107-
name: "Cargo format"
108-
runs-on: "ubuntu-latest"
109-
steps:
110-
- name: "Check out the repo"
111-
uses: actions/checkout@v3
112-
113-
- uses: "actions-rs/toolchain@v1"
114-
with:
115-
profile: "minimal"
116-
toolchain: "stable"
117-
override: true
118-
119-
- run: "rustup component add rustfmt"
120-
121-
- uses: "actions-rs/cargo@v1"
122-
with:
123-
command: "fmt"
124-
args: "--all -- --check"
125-
126106
clippy:
127107
name: "Cargo clippy"
128108
runs-on: "ubuntu-latest"

.github/workflows/coverage.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,45 @@
1+
name: Coverage
12

3+
on:
4+
workflow_dispatch:
5+
push:
6+
branches: [ main ]
7+
pull_request:
8+
branches: [ main ]
9+
10+
env:
11+
CARGO_TERM_COLOR: always
12+
13+
jobs:
14+
coverage:
15+
name: Code coverage
16+
runs-on: ubuntu-latest
17+
18+
steps:
19+
- uses: actions/checkout@v4
20+
- uses: dtolnay/rust-toolchain@stable
21+
with:
22+
components: llvm-tools-preview
23+
- uses: actions/cache@v3
24+
with:
25+
path: |
26+
~/.cargo/bin/
27+
~/.cargo/registry/index/
28+
~/.cargo/registry/cache/
29+
~/.cargo/git/db/
30+
target/
31+
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
32+
33+
- name: Install cargo-llvm-cov
34+
uses: taiki-e/install-action@cargo-llvm-cov
35+
36+
- name: Generate code coverage
37+
run: cargo llvm-cov --workspace --lcov --output-path lcov.info
38+
39+
- name: Upload coverage to Codecov
40+
uses: codecov/codecov-action@v3
41+
with:
42+
token: ${{ secrets.CODECOV_TOKEN }}
43+
files: lcov.info
44+
fail_ci_if_error: true
45+
verbose: true

.github/workflows/publish.yml

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,59 @@
1-
name: Publish on crates.io
1+
name: Publish to crates.io
22

33
on:
4-
pull_request:
5-
types:
6-
- closed
4+
push:
75
branches:
86
- main
7+
paths:
8+
- 'Cargo.toml'
99

1010
jobs:
11-
publish:
12-
if: github.event.pull_request.merged == true
11+
check-version:
1312
runs-on: ubuntu-latest
13+
outputs:
14+
should_publish: ${{ steps.compare_versions.outputs.should_publish }}
1415

1516
steps:
16-
- name: Checkout repository
17-
uses: actions/checkout@v2
17+
- uses: actions/checkout@v4
1818
with:
19-
fetch-depth: 0
20-
ref: main
19+
fetch-depth: 0 # Fetch all history for comparing versions
20+
21+
- name: Compare versions
22+
id: compare_versions
23+
run: |
24+
# Get the current version from Cargo.toml
25+
CURRENT_VERSION=$(grep '^version = ' Cargo.toml | cut -d '"' -f 2)
26+
echo "Current version: $CURRENT_VERSION"
27+
28+
# Get the previous version from the parent commit (HEAD~1)
29+
git checkout HEAD~1
30+
PREVIOUS_VERSION=$(grep '^version = ' Cargo.toml | cut -d '"' -f 2)
31+
echo "Previous version: $PREVIOUS_VERSION"
32+
33+
# Compare versions
34+
if [ "$CURRENT_VERSION" != "$PREVIOUS_VERSION" ]; then
35+
echo "Version changed from $PREVIOUS_VERSION to $CURRENT_VERSION"
36+
echo "should_publish=true" >> "$GITHUB_OUTPUT"
37+
else
38+
echo "Version unchanged"
39+
echo "should_publish=false" >> "$GITHUB_OUTPUT"
40+
fi
41+
42+
publish:
43+
needs: check-version
44+
if: needs.check-version.outputs.should_publish == 'true'
45+
runs-on: ubuntu-latest
46+
47+
steps:
48+
- uses: actions/checkout@v4
2149

2250
- name: Setup Rust
23-
uses: actions-rs/toolchain@v1
24-
with:
25-
profile: minimal
26-
toolchain: stable
27-
override: true
28-
51+
uses: dtolnay/rust-toolchain@stable
52+
53+
- name: Check if package is publishable
54+
run: cargo publish --dry-run
55+
2956
- name: Publish to crates.io
30-
if: steps.check_version.outputs.version_updated == 'true'
3157
run: cargo publish --token ${{ secrets.CRATES_IO_TOKEN }}
58+
59+

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99
/data
1010
*.profraw
1111
all_code.txt
12+
.code
13+
.amc.toml

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "dom-content-extraction"
3-
version = "0.3.6"
3+
version = "0.3.7"
44

55
description = "Rust implementation of Content extraction via text density paper"
66
license = "MPL-2.0"

notes.org

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
#+title: Notes
22

3+
* Unicode document handling
4+
** TODO I think there are problems processing Unicode.
5+
36
* Microtasks
4-
** Better split for CI/CD workflows
7+
** DONE Better split for CI/CD workflows
58
** DONE 13 Nov - updating, removing unwraps.
69
*** DONE add custom errors, remove unwraps
710
*** DONE forbid unwrap in linting rules
8-
** TODO coverage should be >80%
9-
** TODO integrate cargo-tarpaulin or gcov into the github ci pipeline
11+
** DONE coverage should be >80%
12+
** DONE integrate cargo-tarpaulin or gcov into the github ci pipeline
1013
** TODO cargo publish workflow
11-
** TODO add clear copy-pastable example into readme.md
14+
** DONE add clear copy-pastable example into readme.md
1215
** DONE need "examples" command for "lorem ipsum" test page
1316
** DONE need "benchmark"
1417
** DONE need to implement testing against a real data set

src/lib.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,4 +885,35 @@ mod tests {
885885

886886
assert!(!extracted_content.contains("Menu"));
887887
}
888+
889+
#[test]
890+
fn test_document_node_handling() {
891+
// Create a minimal HTML document that forces Document node traversal
892+
let html = r#"<!DOCTYPE html><html><body><div>Test</div></body></html>"#;
893+
let document = Html::parse_document(html);
894+
895+
// Get the root node which should be a Document node
896+
let root_node = document.tree.root();
897+
assert!(matches!(root_node.value(), scraper::Node::Document));
898+
899+
// Create a DensityTree starting from root to ensure Document node is encountered
900+
let mut density_tree = DensityTree::new(root_node.id());
901+
DensityTree::build_density_tree(
902+
root_node,
903+
&mut density_tree.tree.root_mut(),
904+
1,
905+
);
906+
907+
// If we reach here without panicking and the tree is built,
908+
// it means the Document node was properly skipped
909+
assert!(density_tree.tree.root().children().count() > 0);
910+
911+
// Verify the content is still processed despite skipping Document node
912+
let text_nodes: Vec<_> = density_tree
913+
.tree
914+
.nodes()
915+
.filter(|n| n.value().char_count > 0)
916+
.collect();
917+
assert!(!text_nodes.is_empty());
918+
}
888919
}

0 commit comments

Comments
 (0)