Skip to content

Commit e388c29

Browse files
[CDRIVER-5983] Refactor String Handling Around URI Parsing (#2047)
* `mlib/str.h` - String utilities This commit adds `mlib_str_view` and utilities for manipulating sized string views. * Fix docs builds with older Sphinx 7.1 * Error utilities for clearing/resetting an error obj * A more robust integer parsing function * "because" assertions * Case normalization in mlib/str * Use sized strings and algos throughout URI parsing This greatly reduces the number of allocated strings to manage, and simplifies parsing operations. - Split the URI into all components before trying to apply any of those to the URI object. - Update several internal APIs to pass sized string views, reducing the requirement to pass null-terminated strings, and reducing the number of redundant `strlen` calls. - `mstr_split_at/around` greatly simplifies parsing of delimited strings. - Put case normalization at a lower level to reduce need to case-fold strings which necessitates a strdup. Instead, use case-insensitive compares in more locations. - Behavior change: Setting compressors to empty string `""` now clears compressors rather than it being an error. * Refactor %-encoding and integer parsing - %-decoding now indicates the position and kind of error to the caller. - %-decoding doesn't use sscanf - %-decoding allocates the full string up-front - Error messages related to %-decoding now explain the problem. - Use new sized integer parsing functions. * Allow passing an error string to reformat itself Allow passing `error.message` as an input to `_mongoc_set_error` by storing the temporary format output in a temporary buffer, then copying that over the `error.message` * uninit vars * Simplify parsing of host specifiers This changes host parsing to use sized strings, and adds more specific error messages in case of parse failures. * Free the wtag on setting it
1 parent d3f6d1a commit e388c29

33 files changed

+1909
-874
lines changed

build/sphinx/mongoc_common.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from sphinx.application import Sphinx
1010
from sphinx.application import logger as sphinx_log
11+
1112
try:
1213
from sphinx.builders.dirhtml import DirectoryHTMLBuilder
1314
except ImportError:
@@ -16,7 +17,8 @@
1617
from sphinx.config import Config
1718
from docutils.parsers.rst import Directive
1819

19-
needs_sphinx = "1.7" # Do not require newer sphinx. EPEL packages build man pages with Sphinx 1.7.6. Refer: CDRIVER-4767
20+
# Do not require newer sphinx. EPEL packages build man pages with Sphinx 1.7.6. Refer: CDRIVER-4767
21+
needs_sphinx = "1.7"
2022
author = "MongoDB, Inc"
2123

2224
# -- Options for HTML output ----------------------------------------------
@@ -38,7 +40,8 @@ def _file_man_page_name(fpath: Path) -> Union[str, None]:
3840
continue
3941
return mat[1]
4042

41-
def _collect_man (app: Sphinx):
43+
44+
def _collect_man(app: Sphinx):
4245
# Note: 'app' is partially-formed, as this is called from the Sphinx.__init__
4346
docdir = Path(app.srcdir)
4447
# Find everything:
@@ -61,6 +64,7 @@ def _collect_man (app: Sphinx):
6164
assert docname, filepath
6265
man_pages.append((docname, man_name, "", [author], 3))
6366

67+
6468
# -- Options for manual page output ---------------------------------------
6569

6670
# NOTE: This starts empty, but we populate it in `setup` in _collect_man() (see above)
@@ -168,6 +172,7 @@ def generate_html_redirs(app: Sphinx, page: str, templatename: str, context: Dic
168172
builder.css_files[:] = prev_css
169173
sphinx_log.debug("Wrote redirect: %r -> %r", path, page)
170174

175+
171176
def mongoc_common_setup(app: Sphinx):
172177
_collect_man(app)
173178
app.connect("html-page-context", generate_html_redirs)

src/common/src/mlib/intencode.h

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@
2121

2222
#include <mlib/config.h>
2323
#include <mlib/loop.h>
24+
#include <mlib/str.h>
2425

26+
#include <errno.h>
2527
#include <stdint.h>
28+
#include <stdlib.h>
2629
#include <string.h>
2730

2831
/**
@@ -165,3 +168,183 @@ mlib_write_f64le (void *out, double d)
165168
memcpy (&bits, &d, sizeof d);
166169
return mlib_write_u64le (out, bits);
167170
}
171+
172+
/**
173+
* @brief Decode a 64-bit natural number
174+
*
175+
* @param in The input string to be decoded. Does not support a sign or base prefix!
* Every character must be a digit: '0'-'9', or a letter 'a'-'z' (case-folded
* before use) standing for the values 10 through 35.
176+
* @param base The base to be decoded. Must not be zero! With a base of zero
* every digit is rejected as out-of-range, so the parse fails with `EINVAL`.
177+
* @param out Optional pointer that receives the decoded value. May be null.
* It is only written when the parse succeeds.
178+
* @return int A result code for the operation:
* - `0` if the whole string was decoded.
* - `EINVAL` if the string is empty, or contains a character that is not a
* digit of the chosen base. This is reported even if an earlier digit
* already overflowed the accumulator.
* - `ERANGE` if every character is a valid digit but the value does not fit
* in a `uint64_t`.
179+
*
180+
* See `mlib_i64_parse` for more details.
181+
*/
182+
static inline int
183+
mlib_nat64_parse (mstr_view in, unsigned base, uint64_t *out)
184+
{
185+
if (in.len == 0) {
186+
// Empty string is not valid
187+
return EINVAL;
188+
}
189+
190+
// Accumulate into this value:
191+
uint64_t value = 0;
192+
// Whether any operation in the parse overflowed the integer value.
// The loop keeps running after an overflow, so a later invalid digit is
// still reported as EINVAL instead of ERANGE.
193+
bool did_overflow = false;
194+
// Loop until we have consumed the full string, or encounter an invalid digit
195+
while (in.len) {
196+
// Shift place value for another digit
// NOTE(review): mlib_mul/mlib_add are used here as "returns true on
// overflow" - confirm against their definitions.
197+
did_overflow = mlib_mul (&value, base) || did_overflow;
198+
// Case-fold for alpha digits
199+
int32_t digit = mlib_latin_tolower (in.data[0]);
200+
unsigned digit_value = 0;
201+
// Only standard digits
202+
if (digit >= '0' && digit <= '9') {
203+
// Normal digit
204+
digit_value = (unsigned) (digit - '0');
205+
} else if (digit >= 'a' && digit <= 'z') {
206+
// Letter digits: 'a' is 10, 'b' is 11, ... 'z' is 35
207+
digit_value = (unsigned) (digit - 'a') + 10;
208+
} else {
209+
// Not a valid alnum digit
210+
return EINVAL;
211+
}
212+
if (digit_value >= base) {
213+
// The digit value is out-of-range for our chosen base
214+
return EINVAL;
215+
}
216+
// Accumulate the new digit value
217+
did_overflow = mlib_add (&value, digit_value) || did_overflow;
218+
// Jump to the next digit in the string
219+
in = mstr_substr (in, 1);
220+
}
221+
222+
if (did_overflow) {
223+
return ERANGE;
224+
}
225+
226+
// Store the result only if the caller gave us somewhere to put it. The
// (void) cast discards the value of the `&&` expression itself.
(void) (out && (*out = value));
227+
return 0;
228+
}
229+
230+
/**
231+
* @brief Parse a string as a 64-bit signed integer
232+
*
233+
* @param in The string of digits to be parsed. May begin with a single `+` or
* `-` sign, which is consumed before any base prefix is examined.
234+
* @param base Optional: The base to use for parsing. Use "0" to infer the base.
* Inference (performed only when `base` is zero) looks at the text after the
* sign: a leading `0x`, `0o`, or `0b` (letter compared after
* `mlib_latin_tolower`) selects base 16, 8, or 2 and is stripped; any other
* leading `0` selects base 8 and is kept as a digit; anything else is
* decimal. With a non-zero `base` no prefix is stripped.
235+
* @param out Optional storage for an int64 value to be updated with the result
236+
* @return int Returns an errno value for the parse
237+
*
238+
* - A value of `0` indicates that the parse was successful.
239+
* - A value of `EINVAL` indicates that the input string is not a valid
240+
* representation of an integer.
241+
* - A value of `ERANGE` indicates that the input string is a valid integer,
242+
* but the actual encoded value cannot be represented in an `int64_t`
243+
* - If the parse fails (returns non-zero), then the value at `*out` will remain
244+
* unmodified.
245+
*
246+
* This differs from `strtoll` in that it requires that the entire string be
247+
* parsed as a valid integer. If parsing stops early, then the result will indicate
248+
* an error of EINVAL.
249+
*/
250+
static inline int
251+
mlib_i64_parse (mstr_view in, unsigned base, int64_t *out)
252+
{
253+
if (in.len == 0) {
254+
// Empty string is not a valid integer
255+
return EINVAL;
256+
}
257+
// Parse the possible sign prefix
258+
int sign = 1;
259+
// Check for a "+"
260+
if (in.data[0] == '+') {
261+
// Just a plus. Drop it and do nothing with it.
262+
in = mstr_substr (in, 1);
263+
}
264+
// Check for a negative prefix
265+
else if (in.data[0] == '-') {
266+
// Negative sign. We'll negate the value later.
267+
in = mstr_substr (in, 1);
268+
sign = -1;
269+
}
270+
271+
// Infer the base value, if the caller did not give one.
// `in` may be empty here (the input was only a sign), hence the `in.len`
// check before reading `in.data[0]`.
272+
if (base == 0) {
273+
if (in.len && in.data[0] == '0') {
274+
if (in.len > 1) {
275+
if (mlib_latin_tolower (in.data[1]) == 'x') {
276+
// Hexadecimal
277+
base = 16;
278+
in = mstr_substr (in, 2);
279+
} else if (mlib_latin_tolower (in.data[1]) == 'o') {
280+
// Octal
281+
base = 8;
282+
in = mstr_substr (in, 2);
283+
} else if (mlib_latin_tolower (in.data[1]) == 'b') {
284+
// Binary
285+
base = 2;
286+
in = mstr_substr (in, 2);
287+
}
288+
}
289+
if (base == 0) {
290+
// Other: Octal with a single "0" prefix. Don't trim this, because
291+
// it may be a literal "0"
292+
base = 8;
293+
}
294+
} else {
295+
// No '0' prefix. Treat it as decimal
296+
base = 10;
297+
}
298+
}
299+
300+
// Try to parse the natural number now that we have removed all prefixes and
301+
// have a non-zero base. If nothing is left after the sign/prefix, the
// natural-number parser rejects the empty string with EINVAL.
302+
// `nat` is only read below after a successful parse has written it.
uint64_t nat;
303+
int rc = mlib_nat64_parse (in, base, &nat);
304+
if (rc) {
305+
return rc;
306+
}
307+
308+
// Try to narrow from the u64 to i64 and apply the sign. This must be done as
309+
// one operation because of the pathological case of parsing INT64_MIN
// NOTE(review): the three-argument mlib_mul is used here as "store
// nat * sign in i64, return true if it does not fit" - confirm against its
// definition.
310+
int64_t i64 = 0;
311+
if (mlib_mul (&i64, nat, sign)) {
312+
return ERANGE;
313+
}
314+
315+
// Write the result only when an output pointer was supplied
(void) (out && (*out = i64));
316+
return 0;
317+
}
318+
319+
// Overload by argument count: `mlib_i64_parse (S, Ptr)` passes a base of 0
// (infer the base), and `mlib_i64_parse (S, Base, Ptr)` uses the given base.
// In both forms `S` is first passed through `mstr_view_from`.
// The name `mlib_i64_parse` inside the final expansion refers to the function
// above, because a macro name is not re-expanded within its own expansion.
// NOTE(review): MLIB_ARGC_PICK presumably selects the `_argc_N` macro that
// matches the number of arguments - confirm against its definition.
#define mlib_i64_parse(...) MLIB_ARGC_PICK (_mlib_i64_parse, __VA_ARGS__)
320+
#define _mlib_i64_parse_argc_2(S, Ptr) _mlib_i64_parse_argc_3 ((S), 0, (Ptr))
321+
#define _mlib_i64_parse_argc_3(S, Base, Ptr) mlib_i64_parse (mstr_view_from ((S)), Base, Ptr)
322+
323+
/**
324+
* @brief Parse a 32-bit integer from a string.
325+
*
* The string is first parsed as a 64-bit integer and the result is then
* narrowed to 32 bits.
*
* @param in The string to be parsed.
* @param base The base to use for parsing, forwarded to `mlib_i64_parse`.
* @param out Optional storage for the result. May be null. It is only written
* when the parse succeeds.
* @return int `0` on success, the non-zero code from `mlib_i64_parse` if the
* 64-bit parse fails, or `ERANGE` if the parsed value does not fit in an
* `int32_t`.
*
326+
* See `mlib_i64_parse` for more details.
327+
*/
328+
static inline int
329+
mlib_i32_parse (mstr_view in, unsigned base, int32_t *out)
330+
{
331+
// `tmp` is only read below after a successful parse has written it.
int64_t tmp;
332+
// This call goes through the three-argument form of the `mlib_i64_parse`
// macro, which is defined before this function.
int ec = mlib_i64_parse (in, base, &tmp);
333+
if (ec) {
334+
// Failed to parse the int64 value.
335+
return ec;
336+
}
337+
// Attempt to narrow to a 32-bit value
// NOTE(review): mlib_narrow is used here as "returns true when the value
// does not fit the destination" - confirm against its definition.
338+
int32_t i32 = 0;
339+
if (mlib_narrow (&i32, tmp)) {
340+
// Value is out-of-range
341+
return ERANGE;
342+
}
343+
// Success
344+
(void) (out && (*out = i32));
345+
return 0;
346+
}
347+
348+
// Overload by argument count, mirroring `mlib_i64_parse`: the two-argument
// form `mlib_i32_parse (S, Ptr)` passes a base of 0, and the three-argument
// form uses the given base. `S` is first passed through `mstr_view_from`.
#define mlib_i32_parse(...) MLIB_ARGC_PICK (_mlib_i32_parse, __VA_ARGS__)
349+
#define _mlib_i32_parse_argc_2(S, Ptr) _mlib_i32_parse_argc_3 ((S), 0, (Ptr))
350+
#define _mlib_i32_parse_argc_3(S, Base, Ptr) mlib_i32_parse (mstr_view_from ((S)), Base, Ptr)

0 commit comments

Comments
 (0)