Skip to content

Commit c03da2a

Browse files
committed
perl.h: don't include charclass_invlists.h indiscriminately
charclass_invlists.h is a generated file with a lot of code (4.5 MB). The vast majority of this code is skipped (masked by #ifdef .. #endif), but since perl.h includes it, the preprocessor has to slog through all 4.5 MB every time a *.c core file is compiled, slowing down the build. This also affects XS modules, which normally include perl.h.

This commit rearranges things a bit in order to speed up compilation. For details, see below.

Conceptually speaking, charclass_invlists.h consists of 4 parts:

1. Code only active in regcomp.c
2. Code only active in regexec.c
3. Code only active in utf8.c
4. Other code (always active)

As it turns out, part 4 consists of two constants (NUM_ANYOF_CODE_POINTS and MAX_FOLD_FROMS) and nothing else. Furthermore, these are only needed in regexp.h.

This commit splits off part 4 into a new header file (regexp_constants.h), which (like charclass_invlists.h) is generated by regen/mk_invlists.pl and included by regexp.h, its only consumer.

Ideally, the rest of charclass_invlists.h (parts 1-3) should not be included in perl.h, but only in regcomp.c/regexec.c/utf8.c. However, this causes problems in practice: The generated code uses symbols like NULL, U8, or U32, so it cannot be included before perl.h, but it defines types that are needed by other headers (proto.h), so it cannot be included after perl.h either.

I couldn't figure out how to disentangle this sanely, so now perl.h still includes charclass_invlists.h, but hidden behind an

    #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C)

guard. This way most consumers of perl.h will never have to touch charclass_invlists.h.

Fixes #22678.
1 parent 83c3e4c commit c03da2a

File tree

10 files changed

+113
-36
lines changed

10 files changed

+113
-36
lines changed

MANIFEST

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ regen.pl Run all scripts that (re)generate files
176176
regen_perly.pl generate perly.{act,h,tab} from perly.y
177177
regexec.c Regular expression evaluator
178178
regexp.h Public declarations for the above
179+
regexp_constants.h Generated by regen/mk_invlists.pl
179180
reginline.h Inline subs for the RE engine.
180181
regnodes.h Description of nodes of the RE engine
181182
run.c The interpreter loop

Makefile.SH

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,8 @@ h = \
534534
handy.h hv.h hv_func.h iperlsys.h keywords.h l1_char_class_tab.h \
535535
mg.h mydtrace.h op.h op_reg_common.h opcode.h pad.h patchlevel.h \
536536
perl.h perlapi.h perly.h pp.h proto.h regcomp.h regcomp_internal.h \
537-
regexp.h scope.h sv.h thread.h unixish.h utf8.h util.h warnings.h \
537+
regexp.h regexp_constants.h scope.h sv.h thread.h unixish.h utf8.h \
538+
util.h warnings.h \
538539
$(CONFIGH)
539540
540541
c_base = \

charclass_invlists.h

Lines changed: 1 addition & 24 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/unicore/uni_keywords.pl

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

perl.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3307,7 +3307,9 @@ typedef struct padname PADNAME;
33073307
#endif
33083308

33093309
#include "handy.h"
3310-
#include "charclass_invlists.h"
3310+
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C)
3311+
# include "charclass_invlists.h"
3312+
#endif
33113313

33123314
#if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO)
33133315
# if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO)

regen/mk_invlists.pl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#!perl -w
2-
use 5.015;
3-
use strict;
1+
#!perl
2+
use v5.16;
43
use warnings;
54
use Unicode::UCD qw(prop_aliases
65
prop_values
@@ -59,13 +58,20 @@
5958
{style => '*', by => 'regen/mk_invlists.pl',
6059
from => "Unicode::UCD"});
6160

61+
my $regexp_constants_fh = open_new('regexp_constants.h', '>',
62+
{style => '*', by => 'regen/mk_invlists.pl',
63+
from => "Unicode::UCD"});
64+
6265
my $in_file_pound_if = "";
6366

6467
my $max_hdr_len = 3; # In headings, how wide a name is allowed?
6568

66-
print $out_fh "/* See the generating file for comments */\n\n";
67-
6869
print $out_fh <<'EOF';
70+
/* See the generating file for comments */
71+
72+
EOF
73+
74+
print $regexp_constants_fh <<'EOF';
6975
/* This gives the number of code points that can be in the bitmap of an ANYOF
7076
* node. The shift number must currently be one of: 8..12. It can't be less
7177
* than 8 (256) because some code relies on it being at least that. Above 12
@@ -84,7 +90,7 @@
8490

8591
my $num_anyof_code_points = '(1 << 8)';
8692

87-
print $out_fh "#define NUM_ANYOF_CODE_POINTS $num_anyof_code_points\n\n";
93+
print $regexp_constants_fh "#define NUM_ANYOF_CODE_POINTS $num_anyof_code_points\n\n";
8894

8995
$num_anyof_code_points = eval $num_anyof_code_points;
9096

@@ -3300,7 +3306,7 @@ sub token_name
33003306

33013307
print "Computing fold data\n" if DEBUG;
33023308

3303-
print $out_fh <<"EOF";
3309+
print $regexp_constants_fh <<"EOF";
33043310
33053311
/* More than one code point may have the same code point as their fold. This
33063312
* gives the maximum number in the current Unicode release. (The folded-to
@@ -3337,6 +3343,8 @@ sub token_name
33373343
}
33383344
}
33393345

3346+
read_only_bottom_close_and_rename($regexp_constants_fh, \@sources);
3347+
33403348
read_only_bottom_close_and_rename($out_fh, \@sources);
33413349

33423350
my %name_to_index;

regexp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,8 @@ and check for NULL.
680680

681681
/* Stuff that needs to be included in the pluggable extension goes below here */
682682

683+
#include "regexp_constants.h"
684+
683685
#ifdef PERL_ANY_COW
684686
# define RXp_MATCH_COPY_FREE(prog) \
685687
STMT_START { \

regexp_constants.h

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/* -*- mode: C; buffer-read-only: t -*-
2+
* !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3+
* This file is built by regen/mk_invlists.pl from Unicode::UCD.
4+
* Any changes made here will be lost!
5+
*/
6+
7+
/* This gives the number of code points that can be in the bitmap of an ANYOF
8+
* node. The shift number must currently be one of: 8..12. It can't be less
9+
* than 8 (256) because some code relies on it being at least that. Above 12
10+
* (4096), and you start running into warnings that some data structure widths
11+
* have been exceeded, though the test suite as of this writing still passes
12+
* for up through 16, which is as high as anyone would ever want to go,
13+
* encompassing all of the Unicode BMP, and thus including all the economically
14+
* important world scripts. At 12 most of them are: including Arabic,
15+
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
16+
 * Japanese, nor Korean. (The regnode sizing data structure in regnodes.h currently
17+
* uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for
18+
* shift values above 12.) Be sure to benchmark before changing, as larger sizes
19+
* do significantly slow down the test suite. */
20+
21+
#define NUM_ANYOF_CODE_POINTS (1 << 8)
22+
23+
24+
/* More than one code point may have the same code point as their fold. This
25+
* gives the maximum number in the current Unicode release. (The folded-to
26+
* code point is not included in this count.) For example, both 'S' and
27+
* \x{17F} fold to 's', so the number for that fold is 2. Another way to
28+
* look at it is the maximum length of all the IVCF_AUX_TABLE's */
29+
#define MAX_FOLD_FROMS 3
30+
31+
/* Generated from:
32+
* 0e8307ab7c654d9c133ea885f5413a4eb5c0123ed2178f7e1cbabed36b67792c lib/Unicode/UCD.pm
33+
* eb840f36e0a7446293578c684a54c6d83d249abde7bdd4dfa89794af1d7fe9e9 lib/unicore/ArabicShaping.txt
34+
* 333ae1e99db0504ca8a046a07dc45b5e7aa91869c685e6bf955ebe674804827a lib/unicore/BidiBrackets.txt
35+
* b4b9e1d87d8ea273613880de9d2b2f0b0b696244b42152bfa0a3106e7d983a20 lib/unicore/BidiMirroring.txt
36+
* 529dc5d0f6386d52f2f56e004bbfab48ce2d587eea9d38ba546c4052491bd820 lib/unicore/Blocks.txt
37+
* cdd49e55eae3bbf1f0a3f6580c974a0263cb86a6a08daa10fbf705b4808a56f7 lib/unicore/CaseFolding.txt
38+
* 3b019c0a33c3140cbc920c078f4f9af2680ba4f71869c8d4de5190667c70b6a3 lib/unicore/CompositionExclusions.txt
39+
* 7570877e0fa197c45338f7c41a02636da4e14c8dba6a3611a01cd30bf329d5ca lib/unicore/DAge.txt
40+
* d367290bc0867e6b484c68370530bdd1a08b6b32404601b8c7accaf83e05628d lib/unicore/DCoreProperties.txt
41+
* d5687a48c95c7d6e1ec59cb29c0f2e8b052018eb069a4371b7368d0561e12a29 lib/unicore/DNormalizationProps.txt
42+
* 743e7bc435c04ab1a8459710b1c3cad56eedced5b806b4659b6e69b85d0adf2a lib/unicore/EastAsianWidth.txt
43+
* f2e04bae8c856fad3a16353a99d4cc2de6c72770260379f5e4974a97548aad2a lib/unicore/EquivalentUnifiedIdeograph.txt
44+
* 9a3ab36d36a22bdb84de7a17b17e9b9c242134f0080f0a8b4b28d209465a8fc8 lib/unicore/HangulSyllableType.txt
45+
* 790bc9595795c0e0a3860a21a7f97157a134b61a4fc4ab03c7d315d07c9a6eb7 lib/unicore/IdStatus.txt
46+
* 71d3ed8f15cd5d8cd00cdebe62015ff26356462774b261b4a2b83d3bf46b1639 lib/unicore/IdType.txt
47+
* 0ce56c1294da405c0a0a0071582ac839fd229bbf97bdd260462ee571309d4ec4 lib/unicore/IndicPositionalCategory.txt
48+
* ffae561a51b47ddbbe267fdd8505ac3776b85b2932268809127acee84200b573 lib/unicore/IndicSyllabicCategory.txt
49+
* 14733bcb6731ae0c07485bf59a41cb3db08785a50bd2b46b836b4341eab7ee46 lib/unicore/Jamo.txt
50+
* 012bca868e2c4e59a5a10a7546baf0c6fb1b2ef458c277f054915c8a49d292bf lib/unicore/LineBreak.txt
51+
* 3e39509e8fae3e5d50ba73759d0b97194501d14a9c63107a6372a46b38be18e8 lib/unicore/NameAliases.txt
52+
* 1d5202155f14841973aa540b1625f4befbde185ac77ce5aceaaaa0501a68bd66 lib/unicore/NamedSequences.txt
53+
* fb9ac8cc154a80cad6caac9897af55a4e75176af6f4e2bb6edc2bf8b1d57f326 lib/unicore/NormTest.txt
54+
* e05c0a2811d113dae4abd832884199a3ea8d187ee1b872d8240a788a96540bfd lib/unicore/PropList.txt
55+
* 13a7666843abea5c6b7eb8c057c57ab9bb2ba96cfc936e204224dd67d71cafad lib/unicore/PropValueAliases.txt
56+
* e4935149af407fa455901832b710bccb63d2453e46d09190e234d019bcfbba45 lib/unicore/PropertyAliases.txt
57+
* 7e07313d9d0bee42220c476b64485995130ae30917bbcf7780b602d677d7e33f lib/unicore/ScriptExtensions.txt
58+
* cca85d830f46aece2e7c1459ef1249993dca8f2e46d51e869255be140d7ea4b0 lib/unicore/Scripts.txt
59+
* 78b29c64b5840d25c11a9f31b665ee551b8a499eca6c70d770fcad7dd710f494 lib/unicore/SpecialCasing.txt
60+
* 806e9aed65037197f1ec85e12be6e8cd870fc5608b4de0fffd990f689f376a73 lib/unicore/UnicodeData.txt
61+
* ca6d332f485a6f5f452b29b4a74146af0f2c17b7577aa4c821d597210f70611a lib/unicore/VerticalOrientation.txt
62+
* 0d2080d0def294a4b7660801cc03ddfe5866ff300c789c2cc1b50fd7802b2d97 lib/unicore/auxiliary/GCBTest.txt
63+
* 5a0f8748575432f8ff95e1dd5bfaa27bda1a844809e17d6939ee912bba6568a1 lib/unicore/auxiliary/GraphemeBreakProperty.txt
64+
* 371bde4052aa593b108684ae292d8ea2dbb93c19990e0cdf416fa7239557aac3 lib/unicore/auxiliary/LBTest.txt
65+
* f62279d8fd10935ba0cf0d8417a1dcbe7ab0d4e62f59c17e02cbe40f580c4162 lib/unicore/auxiliary/SBTest.txt
66+
* 61e4ba975b0a5bc1a76ee931b94914395d7289ef624e3c0d4d6b9460ee387bea lib/unicore/auxiliary/SentenceBreakProperty.txt
67+
* 2a676130c71194245e7c74a837e58330f202600d8ddcf4518129dd476f26e18e lib/unicore/auxiliary/WBTest.txt
68+
* 5188a56e91593467c2e912601ebc78750e6adc9b04541b8c5becb5441e388ce2 lib/unicore/auxiliary/WordBreakProperty.txt
69+
* 29071dba22c72c27783a73016afb8ffaeb025866740791f9c2d0b55cc45a3470 lib/unicore/emoji/emoji.txt
70+
* 4841f2090c2dbc592d3ce43bb74c2191b3da50fb9a0d00274f1448c202851b02 lib/unicore/extracted/DBidiClass.txt
71+
* f10a35451429137f7348825f22d624b6390c526ead3d8e756d2af9e5ed5b2b67 lib/unicore/extracted/DBinaryProperties.txt
72+
* ca54f6360cd288ad92113415bf1f77749015abe11cbd6798d21f7fa81f04205d lib/unicore/extracted/DCombiningClass.txt
73+
* db059ce45e3cec49bfda56e262fa658b3a5561b1648de266c818d2a08a85b78a lib/unicore/extracted/DDecompositionType.txt
74+
* d62e6950f086e53f47c593a38342621f8838f48c49a1de070cf83d3959bd1688 lib/unicore/extracted/DEastAsianWidth.txt
75+
* fe29a45c0882500e591140aaa5c4f5067e6a5d746806148af34400c48b9c06f9 lib/unicore/extracted/DGeneralCategory.txt
76+
* e13ca1344b16023aa38c6ada39f9658536fc6bb7c3c24d579f0bc316a4f4f1e0 lib/unicore/extracted/DJoinGroup.txt
77+
* c4870b11e2b8b7d0eb70b99ce85608e5c28a399efa316cca97238a58ae160e5e lib/unicore/extracted/DJoinType.txt
78+
* 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt
79+
* 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt
80+
* 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt
81+
* 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables
82+
* 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version
83+
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
84+
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
85+
* e7ba2c6c0577fbb8b767a1305dbebcfeec166d11aa010cfbad9001c5f5971ee6 regen/mk_invlists.pl
86+
* ex: set ro ft=c: */

t/porting/regen.t

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ my $tests = 28; # I can't see a clean way to calculate this automatically.
3131

3232
my %skip = ("regen_perly.pl" => [qw(perly.act perly.h perly.tab)],
3333
"regen/keywords.pl" => [qw(keywords.c keywords.h)],
34-
"regen/mk_invlists.pl" => [qw(charclass_invlists.h uni_keywords.h)],
34+
"regen/mk_invlists.pl" => [qw(charclass_invlists.h regexp_constants.h uni_keywords.h)],
3535
"regen/regcharclass.pl" => [qw(regcharclass.h)],
3636
);
3737

uni_keywords.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)