Skip to content

Commit a2cebd4

Browse files
committed
Improve NARC homoglyph handling and use faster/more powerful regex module (#24369)
1 parent e9564e4 commit a2cebd4

File tree

3 files changed

+152
-12
lines changed

3 files changed

+152
-12
lines changed

requirements/prod.txt

Lines changed: 132 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,138 @@ pillow==12.1.0 \
380380
PyJWT==2.10.1 \
381381
--hash=sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953 \
382382
--hash=sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb
383+
regex==2026.1.15 \
384+
--hash=sha256:0057de9eaef45783ff69fa94ae9f0fd906d629d0bd4c3217048f46d1daa32e9b \
385+
--hash=sha256:008b185f235acd1e53787333e5690082e4f156c44c87d894f880056089e9bc7c \
386+
--hash=sha256:05d75a668e9ea16f832390d22131fe1e8acc8389a694c8febc3e340b0f810b93 \
387+
--hash=sha256:069f56a7bf71d286a6ff932a9e6fb878f151c998ebb2519a9f6d1cee4bffdba3 \
388+
--hash=sha256:0751a26ad39d4f2ade8fe16c59b2bf5cb19eb3d2cd543e709e583d559bd9efde \
389+
--hash=sha256:08df9722d9b87834a3d701f3fca570b2be115654dbfd30179f30ab2f39d606d3 \
390+
--hash=sha256:0bda75ebcac38d884240914c6c43d8ab5fb82e74cde6da94b43b17c411aa4c2b \
391+
--hash=sha256:0bf065240704cb8951cc04972cf107063917022511273e0969bdb34fc173456c \
392+
--hash=sha256:0bf650f26087363434c4e560011f8e4e738f6f3e029b85d4904c50135b86cfa5 \
393+
--hash=sha256:0dcd31594264029b57bf16f37fd7248a70b3b764ed9e0839a8f271b2d22c0785 \
394+
--hash=sha256:0f0c7684c7f9ca241344ff95a1de964f257a5251968484270e91c25a755532c5 \
395+
--hash=sha256:124dc36c85d34ef2d9164da41a53c1c8c122cfb1f6e1ec377a1f27ee81deb794 \
396+
--hash=sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5 \
397+
--hash=sha256:166551807ec20d47ceaeec380081f843e88c8949780cd42c40f18d16168bed10 \
398+
--hash=sha256:1704d204bd42b6bb80167df0e4554f35c255b579ba99616def38f69e14a5ccb9 \
399+
--hash=sha256:18388a62989c72ac24de75f1449d0fb0b04dfccd0a1a7c1c43af5eb503d890f6 \
400+
--hash=sha256:194312a14819d3e44628a44ed6fea6898fdbecb0550089d84c403475138d0a09 \
401+
--hash=sha256:1ae6020fb311f68d753b7efa9d4b9a5d47a5d6466ea0d5e3b5a471a960ea6e4a \
402+
--hash=sha256:1cb740d044aff31898804e7bf1181cc72c03d11dfd19932b9911ffc19a79070a \
403+
--hash=sha256:1e1808471fbe44c1a63e5f577a1d5f02fe5d66031dcbdf12f093ffc1305a858e \
404+
--hash=sha256:1e8cd52557603f5c66a548f69421310886b28b7066853089e1a71ee710e1cdc1 \
405+
--hash=sha256:21ca32c28c30d5d65fc9886ff576fc9b59bbca08933e844fa2363e530f4c8218 \
406+
--hash=sha256:2748c1ec0663580b4510bd89941a31560b4b439a0b428b49472a3d9944d11cd8 \
407+
--hash=sha256:27618391db7bdaf87ac6c92b31e8f0dfb83a9de0075855152b720140bda177a2 \
408+
--hash=sha256:2a8d7b50c34578d0d3bf7ad58cde9652b7d683691876f83aedc002862a35dc5e \
409+
--hash=sha256:2b091aefc05c78d286657cd4db95f2e6313375ff65dcf085e42e4c04d9c8d410 \
410+
--hash=sha256:2c2b80399a422348ce5de4fe40c418d6299a0fa2803dd61dc0b1a2f28e280fcf \
411+
--hash=sha256:2f2775843ca49360508d080eaa87f94fa248e2c946bbcd963bb3aae14f333413 \
412+
--hash=sha256:3038a62fc7d6e5547b8915a3d927a0fbeef84cdbe0b1deb8c99bbd4a8961b52a \
413+
--hash=sha256:32655d17905e7ff8ba5c764c43cb124e34a9245e45b83c22e81041e1071aee10 \
414+
--hash=sha256:343db82cb3712c31ddf720f097ef17c11dab2f67f7a3e7be976c4f82eba4e6df \
415+
--hash=sha256:3601ffb5375de85a16f407854d11cca8fe3f5febbe3ac78fb2866bb220c74d10 \
416+
--hash=sha256:3d6ce5ae80066b319ae3bc62fd55a557c9491baa5efd0d355f0de08c4ba54e79 \
417+
--hash=sha256:3d7d92495f47567a9b1669c51fc8d6d809821849063d168121ef801bbc213846 \
418+
--hash=sha256:40c86d8046915bb9aeb15d3f3f15b6fd500b8ea4485b30e1bbc799dab3fe29f8 \
419+
--hash=sha256:4161d87f85fa831e31469bfd82c186923070fc970b9de75339b68f0c75b51903 \
420+
--hash=sha256:41aef6f953283291c4e4e6850607bd71502be67779586a61472beacb315c97ec \
421+
--hash=sha256:453078802f1b9e2b7303fb79222c054cb18e76f7bdc220f7530fdc85d319f99e \
422+
--hash=sha256:492534a0ab925d1db998defc3c302dae3616a2fc3fe2e08db1472348f096ddf2 \
423+
--hash=sha256:4c5ef43b5c2d4114eb8ea424bb8c9cec01d5d17f242af88b2448f5ee81caadbc \
424+
--hash=sha256:4c8fcc5793dde01641a35905d6731ee1548f02b956815f8f1cab89e515a5bdf1 \
425+
--hash=sha256:4def140aa6156bc64ee9912383d4038f3fdd18fee03a6f222abd4de6357ce42a \
426+
--hash=sha256:4e3dd93c8f9abe8aa4b6c652016da9a3afa190df5ad822907efe6b206c09896e \
427+
--hash=sha256:505831646c945e3e63552cc1b1b9b514f0e93232972a2d5bedbcc32f15bc82e3 \
428+
--hash=sha256:5170907244b14303edc5978f522f16c974f32d3aa92109fabc2af52411c9433b \
429+
--hash=sha256:55b4ea996a8e4458dd7b584a2f89863b1655dd3d17b88b46cbb9becc495a0ec5 \
430+
--hash=sha256:55e9d0118d97794367309635df398bdfd7c33b93e2fdfa0b239661cd74b4c14e \
431+
--hash=sha256:56a5595d0f892f214609c9f76b41b7428bed439d98dc961efafdd1354d42baae \
432+
--hash=sha256:57e7d17f59f9ebfa9667e6e5a1c0127b96b87cb9cede8335482451ed00788ba4 \
433+
--hash=sha256:5ef19071f4ac9f0834793af85bd04a920b4407715624e40cb7a0631a11137cdf \
434+
--hash=sha256:5ff818702440a5878a81886f127b80127f5d50563753a28211482867f8318106 \
435+
--hash=sha256:619843841e220adca114118533a574a9cd183ed8a28b85627d2844c500a2b0db \
436+
--hash=sha256:621f73a07595d83f28952d7bd1e91e9d1ed7625fb7af0064d3516674ec93a2a2 \
437+
--hash=sha256:693b465171707bbe882a7a05de5e866f33c76aa449750bee94a8d90463533cc9 \
438+
--hash=sha256:6bfc31a37fd1592f0c4fc4bfc674b5c42e52efe45b4b7a6a14f334cca4bcebe4 \
439+
--hash=sha256:6d220a2517f5893f55daac983bfa9fe998a7dbcaee4f5d27a88500f8b7873788 \
440+
--hash=sha256:6e42844ad64194fa08d5ccb75fe6a459b9b08e6d7296bd704460168d58a388f3 \
441+
--hash=sha256:726ea4e727aba21643205edad8f2187ec682d3305d790f73b7a51c7587b64bdd \
442+
--hash=sha256:74f45d170a21df41508cb67165456538425185baaf686281fa210d7e729abc34 \
443+
--hash=sha256:7dcc02368585334f5bc81fc73a2a6a0bbade60e7d83da21cead622faf408f32c \
444+
--hash=sha256:7e1e28be779884189cdd57735e997f282b64fd7ccf6e2eef3e16e57d7a34a815 \
445+
--hash=sha256:7ef7d5d4bd49ec7364315167a4134a015f61e8266c6d446fc116a9ac4456e10d \
446+
--hash=sha256:8050ba2e3ea1d8731a549e83c18d2f0999fbc99a5f6bd06b4c91449f55291804 \
447+
--hash=sha256:82345326b1d8d56afbe41d881fdf62f1926d7264b2fc1537f99ae5da9aad7913 \
448+
--hash=sha256:8355ad842a7c7e9e5e55653eade3b7d1885ba86f124dd8ab1f722f9be6627434 \
449+
--hash=sha256:86c1077a3cc60d453d4084d5b9649065f3bf1184e22992bd322e1f081d3117fb \
450+
--hash=sha256:87adf5bd6d72e3e17c9cb59ac4096b1faaf84b7eb3037a5ffa61c4b4370f0f13 \
451+
--hash=sha256:8db052bbd981e1666f09e957f3790ed74080c2229007c1dd67afdbf0b469c48b \
452+
--hash=sha256:8dd16fba2758db7a3780a051f245539c4451ca20910f5a5e6ea1c08d06d4a76b \
453+
--hash=sha256:8e32f7896f83774f91499d239e24cebfadbc07639c1494bb7213983842348337 \
454+
--hash=sha256:91c5036ebb62663a6b3999bdd2e559fd8456d17e2b485bf509784cd31a8b1705 \
455+
--hash=sha256:9250d087bc92b7d4899ccd5539a1b2334e44eee85d848c4c1aef8e221d3f8c8f \
456+
--hash=sha256:9479cae874c81bf610d72b85bb681a94c95722c127b55445285fb0e2c82db8e1 \
457+
--hash=sha256:968c14d4f03e10b2fd960f1d5168c1f0ac969381d3c1fcc973bc45fb06346599 \
458+
--hash=sha256:97499ff7862e868b1977107873dd1a06e151467129159a6ffd07b66706ba3a9f \
459+
--hash=sha256:99ad739c3686085e614bf77a508e26954ff1b8f14da0e3765ff7abbf7799f952 \
460+
--hash=sha256:9d787e3310c6a6425eb346be4ff2ccf6eece63017916fd77fe8328c57be83521 \
461+
--hash=sha256:a1774cd1981cd212506a23a14dba7fdeaee259f5deba2df6229966d9911e767a \
462+
--hash=sha256:a30a68e89e5a218b8b23a52292924c1f4b245cb0c68d1cce9aec9bbda6e2c160 \
463+
--hash=sha256:adc97a9077c2696501443d8ad3fa1b4fc6d131fc8fd7dfefd1a723f89071cf0a \
464+
--hash=sha256:b0d190e6f013ea938623a58706d1469a62103fb2a241ce2873a9906e0386582c \
465+
--hash=sha256:b10e42a6de0e32559a92f2f8dc908478cc0fa02838d7dbe764c44dca3fa13569 \
466+
--hash=sha256:b2a13dd6a95e95a489ca242319d18fc02e07ceb28fa9ad146385194d95b3c829 \
467+
--hash=sha256:b30bcbd1e1221783c721483953d9e4f3ab9c5d165aa709693d3f3946747b1aea \
468+
--hash=sha256:b325d4714c3c48277bfea1accd94e193ad6ed42b4bad79ad64f3b8f8a31260a5 \
469+
--hash=sha256:b5a28980a926fa810dbbed059547b02783952e2efd9c636412345232ddb87ff6 \
470+
--hash=sha256:b5f7d8d2867152cdb625e72a530d2ccb48a3d199159144cbdd63870882fb6f80 \
471+
--hash=sha256:bfb0d6be01fbae8d6655c8ca21b3b72458606c4aec9bbc932db758d47aba6db1 \
472+
--hash=sha256:bfd876041a956e6a90ad7cdb3f6a630c07d491280bfeed4544053cd434901681 \
473+
--hash=sha256:c08c1f3e34338256732bd6938747daa3c0d5b251e04b6e43b5813e94d503076e \
474+
--hash=sha256:c243da3436354f4af6c3058a3f81a97d47ea52c9bd874b52fd30274853a1d5df \
475+
--hash=sha256:c32bef3e7aeee75746748643667668ef941d28b003bfc89994ecf09a10f7a1b5 \
476+
--hash=sha256:c661fc820cfb33e166bf2450d3dadbda47c8d8981898adb9b6fe24e5e582ba60 \
477+
--hash=sha256:c6c4dcdfff2c08509faa15d36ba7e5ef5fcfab25f1e8f85a0c8f45bc3a30725d \
478+
--hash=sha256:c6c565d9a6e1a8d783c1948937ffc377dd5771e83bd56de8317c450a954d2056 \
479+
--hash=sha256:c8a154cf6537ebbc110e24dabe53095e714245c272da9c1be05734bdad4a61aa \
480+
--hash=sha256:c9c08c2fbc6120e70abff5d7f28ffb4d969e14294fb2143b4b5c7d20e46d1714 \
481+
--hash=sha256:ca89c5e596fc05b015f27561b3793dc2fa0917ea0d7507eebb448efd35274a70 \
482+
--hash=sha256:cc7cd0b2be0f0269283a45c0d8b2c35e149d1319dcb4a43c9c3689fa935c1ee6 \
483+
--hash=sha256:cda1ed70d2b264952e88adaa52eea653a33a1b98ac907ae2f86508eb44f65cdc \
484+
--hash=sha256:cf8ff04c642716a7f2048713ddc6278c5fd41faa3b9cab12607c7abecd012c22 \
485+
--hash=sha256:cfecdaa4b19f9ca534746eb3b55a5195d5c95b88cac32a205e981ec0a22b7d31 \
486+
--hash=sha256:d426616dae0967ca225ab12c22274eb816558f2f99ccb4a1d52ca92e8baf180f \
487+
--hash=sha256:d5eaa4a4c5b1906bd0d2508d68927f15b81821f85092e06f1a34a4254b0e1af3 \
488+
--hash=sha256:d639a750223132afbfb8f429c60d9d318aeba03281a5f1ab49f877456448dcf1 \
489+
--hash=sha256:d920392a6b1f353f4aa54328c867fec3320fa50657e25f64abf17af054fc97ac \
490+
--hash=sha256:d991483606f3dbec93287b9f35596f41aa2e92b7c2ebbb935b63f409e243c9af \
491+
--hash=sha256:d9ea2604370efc9a174c1b5dcc81784fb040044232150f7f33756049edfc9026 \
492+
--hash=sha256:dbaf3c3c37ef190439981648ccbf0c02ed99ae066087dd117fcb616d80b010a4 \
493+
--hash=sha256:dca3582bca82596609959ac39e12b7dad98385b4fefccb1151b937383cec547d \
494+
--hash=sha256:e3174a5ed4171570dc8318afada56373aa9289eb6dc0d96cceb48e7358b0e220 \
495+
--hash=sha256:e43a55f378df1e7a4fa3547c88d9a5a9b7113f653a66821bcea4718fe6c58763 \
496+
--hash=sha256:e69d0deeb977ffe7ed3d2e4439360089f9c3f217ada608f0f88ebd67afb6385e \
497+
--hash=sha256:e85dc94595f4d766bd7d872a9de5ede1ca8d3063f3bdf1e2c725f5eb411159e3 \
498+
--hash=sha256:e90b8db97f6f2c97eb045b51a6b2c5ed69cedd8392459e0642d4199b94fabd7e \
499+
--hash=sha256:e9bf3f0bbdb56633c07d7116ae60a576f846efdd86a8848f8d62b749e1209ca7 \
500+
--hash=sha256:ea4e6b3566127fda5e007e90a8fd5a4169f0cf0619506ed426db647f19c8454a \
501+
--hash=sha256:ec94c04149b6a7b8120f9f44565722c7ae31b7a6d2275569d2eefa76b83da3be \
502+
--hash=sha256:eddf73f41225942c1f994914742afa53dc0d01a6e20fe14b878a1b1edc74151f \
503+
--hash=sha256:ee6854c9000a10938c79238de2379bea30c82e4925a371711af45387df35cab8 \
504+
--hash=sha256:ef71d476caa6692eea743ae5ea23cde3260677f70122c4d258ca952e5c2d4e84 \
505+
--hash=sha256:f052d1be37ef35a54e394de66136e30fa1191fab64f71fc06ac7bc98c9a84618 \
506+
--hash=sha256:f1862739a1ffb50615c0fde6bae6569b5efbe08d98e59ce009f68a336f64da75 \
507+
--hash=sha256:f192a831d9575271a22d804ff1a5355355723f94f31d9eef25f0d45a152fdc1a \
508+
--hash=sha256:f42e68301ff4afee63e365a5fc302b81bb8ba31af625a671d7acb19d10168a8c \
509+
--hash=sha256:f7792f27d3ee6e0244ea4697d92b825f9a329ab5230a78c1a68bd274e64b5077 \
510+
--hash=sha256:f82110ab962a541737bd0ce87978d4c658f06e7591ba899192e2712a517badbb \
511+
--hash=sha256:f9ca1cbdc0fbfe5e6e6f8221ef2309988db5bcede52443aeaee9a4ad555e0dac \
512+
--hash=sha256:fd65af65e2aaf9474e468f9e571bd7b189e1df3a61caa59dcbabd0000e4ea839 \
513+
--hash=sha256:fe2fda4110a3d0bc163c2e0664be44657431440722c5c5315c65155cab92f9e5 \
514+
--hash=sha256:febd38857b09867d3ed3f4f1af7d241c5c50362e25ef43034995b77a50df494e
383515
# amqp is required by kombu
384516
amqp==5.3.1 \
385517
--hash=sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2 \

src/olympia/amo/utils.py

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -492,19 +492,25 @@ def build_characters_normalization_replacement_table(categories_to_strip):
492492
# override symbols we would strip if we only used the table generated
493493
# above.
494494
additional_replacements = {
495-
'c': ('¢',),
495+
'a': ('Ѧ', 'ѧ', 'Ꙙ', 'ꙙ'),
496+
'b': ('В', 'Ъ', 'в', 'ъ', 'ь', 'Ꙏ', 'ꙏ'),
497+
'c': ('¢', 'Ҫ', 'ҫ'),
496498
'd': ('ð', 'đ'),
497-
'e': ('Ε', 'ε', 'Є', 'Э', '€', '℈', 'Ꞓ', 'ꭼ'),
498-
'f': ('ƒ', 'ϝ'),
499-
'h': ('ħ',),
499+
'e': ('Ε', 'ε', 'Є', 'Э', '€', '℈', 'Ꞓ', 'ꭼ', 'з', 'э', 'є', 'Ҽ', 'Ҿ', 'ҿ'),
500+
'f': ('ƒ', 'ϝ', 'ғ', 'Ғ'),
501+
'h': ('ħ', 'Ђ', 'н', 'Ң', 'ң', 'Ҥ', 'ҥ', 'Һ', 'Ӈ', 'ӈ', 'Ӊ', 'ӊ'),
500502
'i': ('ı',),
501-
'k': ('ĸ', 'κ', 'к', 'қ', 'ҝ', 'ҟ', 'ҡ', 'ᴋ'),
503+
'k': ('ĸ', 'κ', 'к', 'қ', 'ҝ', 'ҟ', 'ҡ', 'ᴋ', 'Қ', 'Ҝ', 'Ҟ', 'Ҡ', 'Ӄ', 'ӄ'),
502504
'l': ('ł', 'ꙇ', '𐑃'),
503-
'm': ('ʍ', 'м', 'ᴍ', 'ꮇ'),
504-
'o': ('ø', 'Ѻ', 'ѻ'),
505-
's': ('ѕ',),
506-
't': ('ŧ', 'τ', 'т', 'ᴛ', '⊤', 'Ꚍ', 'ꚍ', 'Ꚑ', 'ꚑ', 'ꞇ'),
507-
'w': ('ω', 'ш'),
505+
'm': ('ʍ', 'м', 'ᴍ', 'ꮇ', 'Ӎ', 'ӎ', 'Ꙧ', 'ꙧ'),
506+
'n': ('И', 'и'),
507+
'o': ('ø', 'Ѻ', 'ѻ', 'ѳ', 'ꙩ'),
508+
'r': ('Я', 'я'),
509+
's': ('ѕ', 'ꙅ', 'Ꙅ', 'Ꚃ', 'ꚃ'),
510+
't': ('ŧ', 'τ', 'т', 'ᴛ', '⊤', 'Ꚍ', 'ꚍ', 'Ꚑ', 'ꚑ', 'ꞇ', 'Ҭ', 'ҭ'),
511+
'w': ('ω', 'ш', 'Ш', 'Щ', 'щ', 'Ѡ', 'ѿ', 'Ꚗ', 'ꚗ'),
512+
'x': ('Ҳ', 'ҳ', 'Ӽ', 'ӽ', 'Ӿ', 'ӿ'),
513+
'y': ('Ұ', 'ұ'),
508514
}
509515
additional_replacement_table = dict(
510516
itertools.chain(

src/olympia/scanners/tasks.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
import itertools
22
import json
33
import os
4-
import re
54
import uuid
65
from collections import defaultdict
76

87
from django.conf import settings
98
from django.db.models import F
109

10+
import regex
1111
import requests
1212
import waffle
1313
import yara
@@ -348,7 +348,9 @@ def _run_narc(*, scanner_result, version, rules=None):
348348

349349
# Run each rule on the values we've accumulated.
350350
for rule in rules:
351-
definition = re.compile(str(rule.definition), re.I)
351+
# We're using `regex`, which is faster/more powerful than the default
352+
# `re` module.
353+
definition = regex.compile(str(rule.definition), regex.I | regex.E)
352354
for value, sources in values.items():
353355
value = str(value)
354356
variants = [(value, None)]

0 commit comments

Comments
 (0)