Skip to content

Commit 9086d9b

Browse files
Add naming validation for flow identifiers (#779) (#783)
* Add naming validation for flow identifiers - Implement validation module for flow names, field names, and targets - Enforce 64-char limit and alphanumeric+underscore restrictions - Prevent double-underscore prefixes (reserved for internal use) - Add comprehensive test coverage with 11 test cases - Maintain backward compatibility Fixes #779 * Address PR review feedback - Remove validation functions from public API (internal use only) - Simplify exception handling - let NamingError be raised directly - Pre-compile regex patterns for performance optimization - Add field name validation in DataScope.__setitem__ - Add target name validation in DataCollector.export * Refactor: Move validation imports to top-level and ensure app_namespace and full_flow_name validation * Refactor: Move flow name validation to Flow.__init__
1 parent 0b04d2e commit 9086d9b

File tree

4 files changed

+257
-7
lines changed

4 files changed

+257
-7
lines changed

python/cocoindex/flow.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010
import inspect
1111
import re
1212

13+
from .validation import (
14+
validate_flow_name,
15+
NamingError,
16+
validate_full_flow_name,
17+
validate_target_name,
18+
)
19+
1320
from dataclasses import dataclass
1421
from enum import Enum
1522
from threading import Lock
@@ -300,6 +307,9 @@ def __getitem__(self, field_name: str) -> DataSlice[T]:
300307
)
301308

302309
def __setitem__(self, field_name: str, value: DataSlice[T]) -> None:
310+
from .validation import validate_field_name
311+
312+
validate_field_name(field_name)
303313
value._state.attach_to_scope(self._engine_data_scope, field_name)
304314

305315
def __enter__(self) -> DataScope:
@@ -367,7 +377,7 @@ def collect(self, **kwargs: Any) -> None:
367377

368378
def export(
369379
self,
370-
name: str,
380+
target_name: str,
371381
target_spec: op.TargetSpec,
372382
/,
373383
*,
@@ -381,6 +391,8 @@ def export(
381391
382392
`vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
383393
"""
394+
395+
validate_target_name(target_name)
384396
if not isinstance(target_spec, op.TargetSpec):
385397
raise ValueError(
386398
"export() can only be called on a CocoIndex target storage"
@@ -398,7 +410,7 @@ def export(
398410
vector_indexes=vector_indexes,
399411
)
400412
self._flow_builder_state.engine_flow_builder.export(
401-
name,
413+
target_name,
402414
_spec_kind(target_spec),
403415
dump_engine_object(target_spec),
404416
dump_engine_object(index_options),
@@ -660,6 +672,8 @@ class Flow:
660672
def __init__(
661673
self, name: str, full_name: str, engine_flow_creator: Callable[[], _engine.Flow]
662674
):
675+
validate_flow_name(name)
676+
validate_full_flow_name(full_name)
663677
self._name = name
664678
self._full_name = full_name
665679
engine_flow = None
@@ -831,11 +845,6 @@ def get_flow_full_name(name: str) -> str:
831845

832846

833847
def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
834-
"""Add a flow definition to the cocoindex library."""
835-
if not all(c.isalnum() or c == "_" for c in name):
836-
raise ValueError(
837-
f"Flow name '{name}' contains invalid characters. Only alphanumeric characters and underscores are allowed."
838-
)
839848
with _flows_lock:
840849
if name in _flows:
841850
raise KeyError(f"Flow with name {name} already exists")

python/cocoindex/setting.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from typing import Callable, Self, Any, overload
88
from dataclasses import dataclass
9+
from .validation import validate_app_namespace_name
910

1011
_app_namespace: str = ""
1112

@@ -27,6 +28,8 @@ def split_app_namespace(full_name: str, delimiter: str) -> tuple[str, str]:
2728

2829
def set_app_namespace(app_namespace: str) -> None:
2930
"""Set the application namespace."""
31+
if app_namespace:
32+
validate_app_namespace_name(app_namespace)
3033
global _app_namespace # pylint: disable=global-statement
3134
_app_namespace = app_namespace
3235

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""Tests for naming validation functionality."""
2+
3+
import pytest
4+
from cocoindex.validation import (
5+
validate_field_name,
6+
validate_flow_name,
7+
validate_full_flow_name,
8+
validate_app_namespace_name,
9+
validate_target_name,
10+
NamingError,
11+
validate_identifier_name,
12+
)
13+
14+
15+
class TestValidateIdentifierName:
16+
"""Test the core validation function."""
17+
18+
def test_valid_names(self) -> None:
19+
"""Test that valid names pass validation."""
20+
valid_names = [
21+
"field1",
22+
"field_name",
23+
"_private",
24+
"a",
25+
"field123",
26+
"FIELD_NAME",
27+
"MyField",
28+
"field_123_test",
29+
]
30+
31+
for name in valid_names:
32+
result = validate_identifier_name(name)
33+
assert result is None, f"Valid name '{name}' failed validation: {result}"
34+
35+
def test_valid_names_with_dots(self) -> None:
36+
"""Test that valid names with dots pass validation when allowed."""
37+
valid_names = ["app.flow", "my_app.my_flow", "namespace.sub.flow", "a.b.c.d"]
38+
39+
for name in valid_names:
40+
result = validate_identifier_name(name, allow_dots=True)
41+
assert result is None, (
42+
f"Valid dotted name '{name}' failed validation: {result}"
43+
)
44+
45+
def test_invalid_starting_characters(self) -> None:
46+
"""Test names with invalid starting characters."""
47+
invalid_names = [
48+
"123field", # starts with digit
49+
".field", # starts with dot
50+
"-field", # starts with dash
51+
" field", # starts with space
52+
]
53+
54+
for name in invalid_names:
55+
result = validate_identifier_name(name)
56+
assert result is not None, (
57+
f"Invalid name '{name}' should have failed validation"
58+
)
59+
60+
def test_double_underscore_restriction(self) -> None:
61+
"""Test double underscore restriction."""
62+
invalid_names = ["__reserved", "__internal", "__test"]
63+
64+
for name in invalid_names:
65+
result = validate_identifier_name(name)
66+
assert result is not None
67+
assert "double underscores" in result.lower()
68+
69+
def test_length_restriction(self) -> None:
70+
"""Test maximum length restriction."""
71+
long_name = "a" * 65
72+
result = validate_identifier_name(long_name, max_length=64)
73+
assert result is not None
74+
assert "maximum length" in result.lower()
75+
76+
77+
class TestSpecificValidators:
78+
"""Test the specific validation functions."""
79+
80+
def test_valid_field_names(self) -> None:
81+
"""Test valid field names."""
82+
valid_names = ["field1", "field_name", "_private", "FIELD"]
83+
for name in valid_names:
84+
validate_field_name(name) # Should not raise
85+
86+
def test_invalid_field_names(self) -> None:
87+
"""Test invalid field names raise NamingError."""
88+
invalid_names = ["123field", "field-name", "__reserved", "a" * 65]
89+
90+
for name in invalid_names:
91+
with pytest.raises(NamingError):
92+
validate_field_name(name)
93+
94+
def test_flow_validation(self) -> None:
95+
"""Test flow name validation."""
96+
# Valid flow names
97+
validate_flow_name("MyFlow")
98+
validate_flow_name("my_flow_123")
99+
100+
# Invalid flow names
101+
with pytest.raises(NamingError):
102+
validate_flow_name("123flow")
103+
104+
with pytest.raises(NamingError):
105+
validate_flow_name("__reserved_flow")
106+
107+
def test_full_flow_name_allows_dots(self) -> None:
108+
"""Test that full flow names allow dots."""
109+
validate_full_flow_name("app.my_flow")
110+
validate_full_flow_name("namespace.subnamespace.flow")
111+
112+
# But still reject invalid patterns
113+
with pytest.raises(NamingError):
114+
validate_full_flow_name("123.invalid")
115+
116+
def test_target_validation(self) -> None:
117+
"""Test target name validation."""
118+
validate_target_name("my_target")
119+
validate_target_name("output_table")
120+
121+
with pytest.raises(NamingError):
122+
validate_target_name("123target")
123+
124+
def test_app_namespace_validation(self) -> None:
125+
"""Test app namespace validation."""
126+
validate_app_namespace_name("myapp")
127+
validate_app_namespace_name("my_app_123")
128+
129+
# Should not allow dots in app namespace
130+
with pytest.raises(NamingError):
131+
validate_app_namespace_name("my.app")
132+
133+
with pytest.raises(NamingError):
134+
validate_app_namespace_name("123app")

python/cocoindex/validation.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""
2+
Naming validation for CocoIndex identifiers.
3+
4+
This module enforces naming conventions for flow names, field names,
5+
target names, and app namespace names as specified in issue #779.
6+
"""
7+
8+
import re
9+
from typing import Optional
10+
11+
_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
12+
_IDENTIFIER_WITH_DOTS_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.]*$")
13+
14+
15+
class NamingError(ValueError):
16+
"""Exception raised for naming convention violations."""
17+
18+
pass
19+
20+
21+
def validate_identifier_name(
22+
name: str,
23+
max_length: int = 64,
24+
allow_dots: bool = False,
25+
identifier_type: str = "identifier",
26+
) -> Optional[str]:
27+
"""
28+
Validate identifier names according to CocoIndex naming rules.
29+
30+
Args:
31+
name: The name to validate
32+
max_length: Maximum allowed length (default 64)
33+
allow_dots: Whether to allow dots in the name (for full flow names)
34+
identifier_type: Type of identifier for error messages
35+
36+
Returns:
37+
None if valid, error message string if invalid
38+
"""
39+
if not name:
40+
return f"{identifier_type} name cannot be empty"
41+
42+
if len(name) > max_length:
43+
return f"{identifier_type} name '{name}' exceeds maximum length of {max_length} characters"
44+
45+
if name.startswith("__"):
46+
return f"{identifier_type} name '{name}' cannot start with double underscores (reserved for internal usage)"
47+
48+
# Define allowed pattern
49+
if allow_dots:
50+
pattern = _IDENTIFIER_WITH_DOTS_PATTERN
51+
allowed_chars = "letters, digits, underscores, and dots"
52+
else:
53+
pattern = _IDENTIFIER_PATTERN
54+
allowed_chars = "letters, digits, and underscores"
55+
56+
if not pattern.match(name):
57+
return f"{identifier_type} name '{name}' must start with a letter or underscore and contain only {allowed_chars}"
58+
59+
return None
60+
61+
62+
def validate_field_name(name: str) -> None:
63+
"""Validate field names."""
64+
error = validate_identifier_name(
65+
name, max_length=64, allow_dots=False, identifier_type="Field"
66+
)
67+
if error:
68+
raise NamingError(error)
69+
70+
71+
def validate_flow_name(name: str) -> None:
72+
"""Validate flow names."""
73+
error = validate_identifier_name(
74+
name, max_length=64, allow_dots=False, identifier_type="Flow"
75+
)
76+
if error:
77+
raise NamingError(error)
78+
79+
80+
def validate_full_flow_name(name: str) -> None:
81+
"""Validate full flow names (can contain dots for namespacing)."""
82+
error = validate_identifier_name(
83+
name, max_length=64, allow_dots=True, identifier_type="Full flow"
84+
)
85+
if error:
86+
raise NamingError(error)
87+
88+
89+
def validate_app_namespace_name(name: str) -> None:
90+
"""Validate app namespace names."""
91+
error = validate_identifier_name(
92+
name, max_length=64, allow_dots=False, identifier_type="App namespace"
93+
)
94+
if error:
95+
raise NamingError(error)
96+
97+
98+
def validate_target_name(name: str) -> None:
99+
"""Validate target names."""
100+
error = validate_identifier_name(
101+
name, max_length=64, allow_dots=False, identifier_type="Target"
102+
)
103+
if error:
104+
raise NamingError(error)

0 commit comments

Comments
 (0)