Skip to content

Commit e03dd1e

Browse files
committed
文件扩展名拆分 使用预编译的DFA
1 parent d54653f commit e03dd1e

11 files changed

+415
-100
lines changed

Cargo.lock

Lines changed: 246 additions & 55 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,6 @@ name = "torrent-tidy"
33
version = "0.1.0"
44
edition = "2021"
55

6-
[dependencies]
7-
reqwest = { version = "0.12", features = ["json", "cookies"] }
8-
serde = { version = "1.0", features = ["derive"] }
9-
serde_json = "1.0"
10-
tokio = { version = "1", features = ["full"] }
11-
clap = { version = "4.5", features = ["derive"] }
12-
regex = { version = "1.11" }
13-
146
[workspace]
157
# Specify the dependency resolver version
168
resolver = "2"
@@ -22,3 +14,18 @@ codegen-units = 1 # Reduce number of codegen units to increase optimizations
2214
panic = 'abort' # Abort on panic
2315
strip = true # Strip symbols from the binary; `strip = true` is equivalent to `strip = "symbols"`
2416
debug = false # Disable debug info
17+
18+
[dependencies]
19+
reqwest = { version = "0.12", features = ["json", "cookies"] }
20+
serde = { version = "1.0", features = ["derive"] }
21+
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
22+
clap = { version = "4.5", features = ["derive"] }
23+
regex = { version = "1.11" }
24+
regex-automata = { version = "0.4", features = ["std", "dfa-search"] }
25+
26+
[dev-dependencies]
27+
criterion = "0.5"
28+
29+
[[bench]]
30+
name = "my_benchmark"
31+
harness = false
77.3 KB
Binary file not shown.
77.3 KB
Binary file not shown.

benches/my_benchmark.rs

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
use std::sync::LazyLock;
2+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
3+
use regex::Regex;
4+
use regex_automata::{dfa::Automaton, Anchored, Input};
5+
6+
/// Splits `filename` into `(stem, extension)` with a regex (the baseline measured by this bench).
///
/// The pattern recognizes the compound extensions `tar.gz`, `tar.xz`, `tar.bz2`, `cpio.gz`,
/// `cpio.bz2` and `7z`/`rar`/`zip` followed by a dot and three digits; otherwise the extension
/// is the final dot-free segment. The separating dot is in neither returned part.
///
/// Returns `(filename, "")` when the pattern does not match.
fn split_filename(filename: &str) -> (String, String) {
7+
// Built on first access through `LazyLock`, so the pattern is compiled at most once.
static RE: LazyLock<Regex> = LazyLock::new(|| {
8+
Regex::new(r"^(.*?)\.(tar\.gz|tar\.xz|tar\.bz2|cpio\.gz|cpio\.bz2|(?:7z|rar|zip)\.\d{3}|[^.]+)$").unwrap()
9+
});
10+
11+
if let Some(caps) = RE.captures(filename) {
12+
// Groups 1 and 2 are both mandatory parts of the pattern, so they are present on any match.
(caps.get(1).unwrap().as_str().to_string(), caps.get(2).unwrap().as_str().to_string())
13+
} else {
14+
(filename.to_string(), String::new()) // no-extension case
15+
}
16+
}
17+
18+
/// Splits `name` at its last `.` into an owned `(stem, extension)` pair.
///
/// This is the plain string-based splitter kept as a benchmark baseline; it knows nothing
/// about compound extensions such as `tar.gz` and always cuts at the final dot.
///
/// Returns `(name, "")` when `name` contains no dot, when the last dot is the first
/// character (e.g. `".f"`), or when `name` ends with a dot (e.g. `"f."`). The separating
/// dot is never included in either part.
fn split_filename_old(name: &str) -> (String, String) {
    match name.rsplit_once('.') {
        // Only a dot with text on both sides separates a stem from an extension.
        Some((stem, ext)) if !stem.is_empty() && !ext.is_empty() => {
            (stem.to_string(), ext.to_string())
        }
        _ => (name.to_string(), String::new()),
    }
}
31+
32+
use regex_automata::{
33+
dfa::dense::DFA,
34+
util::{lazy::Lazy, wire::AlignAs},
35+
};
36+
37+
/// Lazily deserialized dense DFA used by `split_filename_new` in this benchmark.
///
/// The serialized automaton is embedded at compile time with `include_bytes!`; the
/// big-endian or little-endian file is selected by `target_endian`. The bytes are wrapped in
/// `AlignAs<[u8], u32>` before being handed to `DFA::from_bytes`.
///
/// NOTE(review): this has the same name and shape as the static in
/// `src/re/file_extension_split.rs` and looks like a copy of that generated code — confirm
/// the two `.dfa` files are kept in sync.
///
/// # Panics
///
/// Panics with "serialized DFA should be valid" on first access if `from_bytes` rejects the
/// embedded bytes.
pub static FILE_EXTENSION_SPLIT: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
38+
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
39+
_align: [],
40+
#[cfg(target_endian = "big")]
41+
bytes: *include_bytes!("file_extension_split.bigendian.dfa"),
42+
#[cfg(target_endian = "little")]
43+
bytes: *include_bytes!("file_extension_split.littleendian.dfa"),
44+
};
45+
// The second tuple element returned by `from_bytes` is not needed here and is discarded.
let (dfa, _) = regex_automata::dfa::dense::DFA::from_bytes(&ALIGNED.bytes).expect("serialized DFA should be valid");
46+
dfa
47+
});
48+
49+
/// Splits a file name into its stem and extension using the precompiled
/// `FILE_EXTENSION_SPLIT` DFA.
///
/// Runs an anchored reverse search over `filename`. On a match, the reported offset is used
/// as the split point and the first byte of the right-hand part (the separating dot) is
/// dropped from the extension. If nothing matches, or the search returns an error, the whole
/// name is returned as the stem with an empty extension.
50+
fn split_filename_new(filename: &str) -> (String, String) {
51+
let input = Input::new(filename).anchored(Anchored::Yes);
52+
match FILE_EXTENSION_SPLIT.try_search_rev(&input) {
53+
Ok(Some(index)) => {
54+
let (main, ext) = filename.split_at(index.offset());
55+
// Drop the dot located at the split index.
56+
(main.into(), ext[1..].into())
57+
}
58+
// NOTE(review): a search error is treated exactly like "no match" — confirm that
// hiding DFA errors behind the no-extension result is intended.
Ok(None) | Err(_) => {
59+
(filename.to_string(), String::new())
60+
},
61+
}
62+
}
63+
64+
65+
/// Registers the six benchmarks of this file on `c`.
///
/// Each of the three splitters is measured on the inputs `"file.with.dots.txt"` and
/// `"a.b.c.d.f"`: the regex-based `split_filename` (id = the input itself), the string-based
/// `split_filename_old` (id suffixed with `.old`) and the DFA-based `split_filename_new`
/// (id suffixed with `.new`). Inputs go through `black_box` so they are opaque to the optimizer.
fn criterion_benchmark(c: &mut Criterion) {
66+
// Regex implementation.
c.bench_function("file.with.dots.txt", |b| b.iter(|| split_filename(black_box("file.with.dots.txt"))));
67+
c.bench_function("a.b.c.d.f", |b| b.iter(|| split_filename(black_box("a.b.c.d.f"))));
68+
69+
// Plain last-dot implementation.
c.bench_function("file.with.dots.txt.old", |b| b.iter(|| split_filename_old(black_box("file.with.dots.txt"))));
70+
c.bench_function("a.b.c.d.f.old", |b| b.iter(|| split_filename_old(black_box("a.b.c.d.f"))));
71+
72+
// Precompiled-DFA implementation.
c.bench_function("file.with.dots.txt.new", |b| b.iter(|| split_filename_new(black_box("file.with.dots.txt"))));
73+
c.bench_function("a.b.c.d.f.new", |b| b.iter(|| split_filename_new(black_box("a.b.c.d.f"))));
74+
}
75+
76+
criterion_group!(benches, criterion_benchmark);
77+
criterion_main!(benches);

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
mod logger;
22
mod q_bit;
3+
mod re;
34

45
use crate::logger::LogUnwrap;
56
use clap::Parser;
@@ -52,7 +53,6 @@ async fn main() {
5253
})
5354
.collect();
5455

55-
5656
let mut builder = Client::builder().cookie_store(true);
5757

5858
if !args.vpn {

src/q_bit.rs

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
use std::sync::LazyLock;
2-
use crate::log;
1+
use crate::{log, re};
32

43
use crate::logger::LogUnwrap;
54
use regex::Regex;
@@ -217,7 +216,7 @@ fn apply_rename_rules(name: &str, compiled_rules: &Vec<(Regex, &str)>) -> String
217216

218217
/// 将文件名应用重命名规则,不改变文件扩展名
219218
fn apply_rename_rules_to_file(name: &str, compiled_rules: &Vec<(Regex, &str)>) -> String {
220-
let (mut stem, ext) = split_filename(name);
219+
let (mut stem, ext) = re::split_filename(name);
221220

222221
// 仅对主名部分应用替换规则
223222
for (re, replacement) in compiled_rules {
@@ -232,37 +231,4 @@ fn apply_rename_rules_to_file(name: &str, compiled_rules: &Vec<(Regex, &str)>) -
232231
} else {
233232
format!("{}.{}", stem, ext)
234233
}
235-
}
236-
237-
238-
239-
/// Splits a file name into its stem and extension (FILE_EXTENSION_SPLIT).
///
/// Compound extensions `tar.{gz,xz,bz2}`, `cpio.{gz,bz2}` and `{7z,rar,zip}.NNN` (three
/// digits) are kept together; otherwise the extension is the final dot-free segment.
/// Returns `(filename, "")` when the pattern does not match.
240-
fn split_filename(filename: &str) -> (String, String) {
241-
// Built on first access through `LazyLock`, so the pattern is compiled at most once.
static RE: LazyLock<Regex> = LazyLock::new(|| {
242-
Regex::new(r"^(.*?)\.(tar\.(?:gz|xz|bz2)|cpio\.(?:gz|bz2)|(?:7z|rar|zip)\.\d{3}|[^.]+)$").unwrap()
243-
});
244-
245-
RE.captures(filename)
246-
.map(|caps| (caps[1].to_string(), caps[2].to_string()))
247-
.unwrap_or_else(|| (filename.to_string(), String::new()))
248-
}
249-
250-
251-
#[cfg(test)]
252-
mod tests {
253-
use super::*;
254-
255-
// Edge cases for `split_filename`: empty input, a lone dot, leading and trailing dots,
// multi-dot names, compound archive extensions, and names without any extension.
#[test]
256-
fn test_2025_02_17_16_36_27() {
257-
assert_eq!(split_filename(""), ("".into(), "".into()));
258-
assert_eq!(split_filename("."), (".".into(), "".into()));
259-
assert_eq!(split_filename("f"), ("f".into(), "".into()));
260-
assert_eq!(split_filename(".f"), ("".into(), "f".into()));
261-
assert_eq!(split_filename("f."), ("f.".into(), "".into()));
262-
assert_eq!(split_filename("a.b.c.d.f"), ("a.b.c.d".into(), "f".into()));
263-
assert_eq!(split_filename("abc.tar.gz"), ("abc".into(), "tar.gz".into()));
264-
assert_eq!(split_filename("abc.7z.001"), ("abc".into(), "7z.001".into()));
265-
assert_eq!(split_filename("file.with.dots.txt"), ("file.with.dots".into(), "txt".into()));
266-
assert_eq!(split_filename("no_extension"), ("no_extension".into(), "".into()));
267-
}
268234
}
77.3 KB
Binary file not shown.
77.3 KB
Binary file not shown.

src/re/file_extension_split.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2+
//
3+
// C:\Users\muyuanjin\.cargo\bin\regex-cli.exe generate serialize dense dfa --minimize --shrink --start-kind anchored --rustfmt --safe --reverse --captures none FILE_EXTENSION_SPLIT ./src/re/ \.(tar\.(?:gz|xz|bz2)|cpio\.(?:gz|bz2)|(?:7z|rar|zip)\.\d{3}|[^.]+)
4+
//
5+
// regex-cli 0.2.1 is available on crates.io.
6+
7+
use regex_automata::{
8+
dfa::dense::DFA,
9+
util::{lazy::Lazy, wire::AlignAs},
10+
};
11+
12+
/// Lazily deserialized dense DFA for splitting a file name into stem and extension.
///
/// Per the generator command recorded in this file's header, the automaton was built by
/// `regex-cli` as a minimized, reverse, anchored-start dense DFA without captures. The
/// serialized bytes are embedded with `include_bytes!` (the big- or little-endian file is
/// selected by `target_endian`) and wrapped in `AlignAs<[u8], u32>` before being handed to
/// `DFA::from_bytes`.
///
/// # Panics
///
/// Panics with "serialized DFA should be valid" on first access if `from_bytes` rejects the
/// embedded bytes.
pub static FILE_EXTENSION_SPLIT: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
13+
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
14+
_align: [],
15+
#[cfg(target_endian = "big")]
16+
bytes: *include_bytes!("file_extension_split.bigendian.dfa"),
17+
#[cfg(target_endian = "little")]
18+
bytes: *include_bytes!("file_extension_split.littleendian.dfa"),
19+
};
20+
// The second tuple element returned by `from_bytes` is not needed here and is discarded.
let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes).expect("serialized DFA should be valid");
21+
dfa
22+
});

0 commit comments

Comments
 (0)