Skip to content

Commit 538b99a

Browse files
committed
Added Intel HyperScan
1 parent b41101d commit 538b99a

File tree

9 files changed

+124
-0
lines changed

9 files changed

+124
-0
lines changed

LICENSES/Unlicense.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Unlicense (Public Domain)
2+
============================
3+
4+
This is free and unencumbered software released into the public domain.
5+
6+
Anyone is free to copy, modify, publish, use, compile, sell, or
7+
distribute this software, either in source code form or as a compiled
8+
binary, for any purpose, commercial or non-commercial, and by any
9+
means.
10+
11+
In jurisdictions that recognize copyright laws, the author or authors
12+
of this software dedicate any and all copyright interest in the
13+
software to the public domain. We make this dedication for the benefit
14+
of the public at large and to the detriment of our heirs and
15+
successors. We intend this dedication to be an overt act of
16+
relinquishment in perpetuity of all present and future rights to this
17+
software under copyright law.
18+
19+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25+
OTHER DEALINGS IN THE SOFTWARE.
26+
27+
For more information, please refer to &lt;<https://unlicense.org/>&gt;
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<!--
2+
SPDX-FileCopyrightText: KOLANICH, 2021
3+
SPDX-License-Identifier: Unlicense
4+
-->
5+
6+
## Intel Hyperscan
7+
8+
Hyperscan is a library for fast matching of regular expressions against binary buffers/streams at large scale.
9+
10+
It serializes precompiled regexps into its own binary format. [1](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/src/db.c#L62-L110) [2](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/doc/dev-reference/serialization.rst).
11+
12+
13+
In this directory I have created a demo app that extracts HDD model names from text streams and detects their vendors/brands.
14+
15+
The regexps have been taken from https://github.com/KOLANICH-ML/HDDModelDecoder.py .
16+
17+
The app first generates a "DB", then scans a test buffer with it and displays the results as a self-check, then generates the serialized representations of the "DB"s and stores them in files.
18+
19+
Only the "simple" format is present in this directory. The [Chimera format](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/chimera/ch_db.h) goes into another directory.
20+
21+
```
22+
Version: 5.4.0 Features: AVX2
23+
vectored: Mode: VECTORED
24+
block: Mode: BLOCK
25+
stream_large: Mode: STREAM
26+
```
27+
28+
Source: own work.
15.9 KB
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SPDX-FileCopyrightText: 2021 KOLANICH
2+
SPDX-License-Identifier: Unlicense
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env python3
2+
3+
import typing
4+
from pathlib import Path
5+
from pprint import pprint
6+
7+
import hyperscan as hs
8+
from hyperscan import Database
9+
10+
__license__ = "Unlicense"
__copyright__ = "KOLANICH, 2021"

# Directory that contains this script.
thisDir = Path(__file__).parent

# Vendor name -> model-number pattern (bytes, as passed to Hyperscan).
# Regular expressions have been taken from https://github.com/KOLANICH-ML/HDDModelDecoder.py
rxs = {
    "HGST": b"([HW])([UDTECMS])([HSCEATNP])(\\d{2}|5C)(\\d{2})(\\d{2})([PDVKA])([L795S])(16|18|36|38|F2|F4|AT|SA|A3|A6|E6|N6|SS|42|52|S6)([0-486M0L])([0-5])",
    "Samsung": b"(HD|HE|HM|HN-M|HS|SP)(\\d{2,3})(HI|HJ|GJ|HX|IX|JX|JI|HA|GA|GB|GI|HB|THB|HC|II|IJ|JB|TJB|JJ|JQ|UJQ|LD|LI|LJ|MBB|RHF|RJF|SI|SJ|UI|UJ|VHF|VJF|WI|\\d[NSC])",
    "WD": b"(WD)(\\dN|\\d{3}M|\\d{2,})([ABCDEFGHJKLMNPSTX][94BDKLRSWYAFZ0123CEJGHMPUV])?([26ABCDEFGHJKLMRSVWPTYZ1U])([RABCDEFGKSTYVWXZ])",
}

# Short mode name -> Hyperscan mode constant(s) used when creating a database.
modes = {
    "vectored": hs.HS_MODE_VECTORED,
    "block": hs.HS_MODE_BLOCK,
    "stream_large": hs.HS_MODE_STREAM | hs.HS_MODE_SOM_HORIZON_LARGE,
}
23+
24+
25+
def prepareDatabase(rxs: typing.Dict[str, bytes], mode: int) -> typing.Tuple[Database, typing.List[str]]:
    """Compile all expressions of ``rxs`` into one Hyperscan database.

    :param rxs: mapping of a label to its regular expression (as bytes).
    :param mode: Hyperscan mode constant(s) handed to ``hs.Database``.
    :returns: ``(db, labels)`` where ``labels`` is the list of the keys of ``rxs``;
        each expression is compiled with the id equal to the index of its label
        in ``labels``, so a reported match id can be mapped back to the label.
    """
    # The same flag set is applied to every expression.
    flags = hs.HS_FLAG_SOM_LEFTMOST | hs.HS_FLAG_ALLOWEMPTY | hs.HS_FLAG_DOTALL | hs.HS_FLAG_MULTILINE

    # Derive both lists from one key order so that ids, labels and expressions stay aligned.
    ks = list(rxs)
    xs = [rxs[k] for k in ks]

    db = hs.Database(mode=mode)
    db.compile(expressions=xs, ids=list(range(len(ks))), flags=[flags] * len(ks))
    return db, ks
34+
35+
36+
def testDb(db: Database, keys: typing.List[str], target: bytes) -> typing.Dict[str, str]:
    """Scan ``target`` with ``db`` and collect what was matched.

    :param db: object whose ``scan(buffer, callback)`` method is invoked.
    :param keys: labels indexed by the match id passed to the callback.
    :param target: buffer to scan.
    :returns: mapping of each matched substring (decoded as UTF-8) to the label
        of the pattern that matched it; a later match of the same substring
        overwrites the earlier entry.
    """
    found = {}

    def onMatch(patternId, begin, end, flags, ctx):
        # ``begin``/``end`` are used as offsets into ``target``.
        matched = target[begin:end].decode("utf-8")
        found[matched] = keys[patternId]

    db.scan(target, onMatch)
    return found
47+
48+
49+
def main() -> None:
    """Run the self-check, then write one serialized database per entry of ``modes``.

    The self-check compiles ``rxs`` in "block" mode, scans a sample buffer and
    pretty-prints the result. Afterwards, for every mode, the database is
    compiled again, serialized with ``hs.dumpb`` and written to
    ``<mode name>.hyperscan_simple`` inside ``thisDir``.
    """
    sample = b"dcsgdfw HDN724040ALE640 HDN724040ALE640 SP1614N fafafsfa WD2500AVJS vkjsbvhfjs"
    blockDb, labels = prepareDatabase(rxs, modes["block"])
    pprint(testDb(blockDb, labels, sample))

    for name, mode in modes.items():
        outFile = thisDir / (name + ".hyperscan_simple")
        # Only the database is needed here; the label list is discarded.
        modeDb, _ = prepareDatabase(rxs, mode)
        blob = hs.dumpb(modeDb)
        print(name, "(", hex(len(blob)), ")", ":", modeDb.info().decode("utf8"))
        outFile.write_bytes(blob)


if __name__ == "__main__":
    main()
15.9 KB
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SPDX-FileCopyrightText: 2021 KOLANICH
2+
SPDX-License-Identifier: Unlicense
15.9 KB
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SPDX-FileCopyrightText: 2021 KOLANICH
2+
SPDX-License-Identifier: Unlicense

0 commit comments

Comments
 (0)