Skip to content

Commit bdb1145

Browse files
authored
Merge pull request #48 from geoadmin/develop
New Release v3.0.1 - #patch
2 parents 864e661 + a7e7379 commit bdb1145

File tree

7 files changed

+378
-97
lines changed

7 files changed

+378
-97
lines changed

.env.testing

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ALLOWED_DOMAINS=.*\.geo\.admin\.ch,.*\.bgdi\.ch,.*\.swisstopo\.cloud
1+
ALLOWED_DOMAINS=.*\.geo\.admin\.ch,.*\.bgdi\.ch,http://localhost
22
AWS_ACCESS_KEY_ID=testing
33
AWS_SECRET_ACCESS_KEY=testing
44
AWS_SECURITY_TOKEN=testing
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Short ID algorithm for shortlinks
2+
3+
> `Status: proposed`
4+
>
5+
> `Date: 2022-05-17`
6+
>
7+
> `Author: Brice Schaffner`
8+
9+
## Context
10+
11+
The current short ID algorithm for `service-shortlink` uses a kind of counter based on the system time.
12+
It takes the current timestamp rounded to the millisecond (milliseconds elapsed since 1970.01.01 00:00:00.000). To
13+
reduce the size of this timestamp, `1'000'000'000'000` is subtracted from it, which gives us the number
14+
of milliseconds from `2001.09.09 03:46:40.000`.
15+
16+
### PROS of the current algorithm
17+
18+
- Very simple
19+
- With a request rate of less than `1 rps` (the current rate is `~0.3 rps`), collisions are quite unlikely and can easily be avoided with a very small number of retries.
20+
21+
### CONS of the current algorithm
22+
23+
- The size of the short ID is dynamic: it is currently 10 characters but will increase in the near future. Around `2037.01.01` we will have 11 characters.
24+
25+
## Nano ID - random short ID
26+
27+
We could reduce the size of the ID to 8 characters by using [Nano ID](https://github.com/ai/nanoid).
28+
Here, however, we have an issue with collisions! Based on the [Nano ID collision calculator](https://zelark.github.io/nano-id-cc/)
29+
and our current request rate of ~1050 rph (requests per hour), we will have a 1% collision risk in 99 days!
30+
Looking more closely at the mathematics (note that I might be wrong here as I'm not a mathematician), we can
31+
compute the collision probability as follows:
32+
33+
- d := number of different possible IDs (see [Permutation with Replacement](https://www.calculatorsoup.com/calculators/discretemathematics/permutationsreplacement.php))
34+
- n := number of IDs
35+
- `1-((d-1)/d)**(n*(n-1)/2)` [Birthday Paradox / Probability of a shared birthday (collision)](https://en.wikipedia.org/wiki/Birthday_problem)
36+
37+
```python
38+
d = 64**8
39+
print(f"{d:,}")
40+
281,474,976,710,656
41+
42+
# Number of IDs after 100 days
43+
n = 1050 * 24 * 100
44+
45+
collision = 1-((d-1)/d)**(n*(n-1)/2)
46+
print(str(int(collision * 100)) + '%')
47+
1%
48+
49+
# Number of IDs after 1 years
50+
n = 1050 * 24 * 365 * 1
51+
52+
collision = 1-((d-1)/d)**(n*(n-1)/2)
53+
print(str(int(collision * 100)) + '%')
54+
13%
55+
56+
# Number of IDs after 3 years
57+
n = 1050 * 24 * 365 * 3
58+
59+
collision = 1-((d-1)/d)**(n*(n-1)/2)
60+
print(str(int(collision * 100)) + '%')
61+
74%
62+
63+
# Number of IDs after 5 years
64+
n = 1050 * 24 * 365 * 5
65+
66+
collision = 1-((d-1)/d)**(n*(n-1)/2)
67+
print(str(int(collision * 100)) + '%')
68+
97%
69+
70+
# Number of IDs after 10 years
71+
n = 1050 * 24 * 365 * 10
72+
73+
collision = 1-((d-1)/d)**(n*(n-1)/2)
74+
print(str(int(collision * 100)) + '%')
75+
99%
76+
```
77+
78+
### Nano ID tests
79+
80+
I tested Nano ID with 1 and 2 characters using the following code:
81+
82+
```python
83+
# app/helpers/utils.py
84+
def generate_short_id():
85+
return generate(size=8)
86+
87+
# tests/unit_tests/test_helpers.py
88+
class TestDynamoDb(BaseShortlinkTestCase):
89+
@params(1, 2)
90+
@patch('app.helpers.dynamo_db.generate_short_id')
91+
def test_duplicate_short_id_end_of_ids(self, m, mock_generate_short_id):
92+
regex = re.compile(r'^[0-9a-zA-Z-_]{' + str(m) + '}$')
93+
94+
def generate_short_id_mocker():
95+
return generate(size=m)
96+
97+
mock_generate_short_id.side_effect = generate_short_id_mocker
98+
# with generate(size=1) we have 64 different possible IDs, as we get closer to this number
99+
# the collision will increase. Here we make sure that we can generate at least the half
100+
# of the maximal number of unique ID with less than the max retry.
101+
n = 64
102+
max_ids = int(factorial(n) / (factorial(m) * factorial(n - m)))
103+
logger.debug('Try to generate %d entries', max_ids)
104+
for i in range(max_ids):
105+
logger.debug('-' * 80)
106+
logger.debug('Add entry %d', i)
107+
if i < max_ids / 2:
108+
109+
next_entry = add_url_to_table(
110+
f'https://www.example/test-duplicate-id-end-of-ids-{i}-url'
111+
)
112+
self.assertIsNotNone(
113+
regex.match(next_entry['shortlink_id']),
114+
msg=f"short ID {next_entry['shortlink_id']} don't match regex"
115+
)
116+
else:
117+
# more thant the half of max ids might fail due to more than COLLISION_MAX_RETRY
118+
# retries, therefore ignore those errors
119+
try:
120+
next_entry = add_url_to_table(
121+
f'https://www.example/test-duplicate-id-end-of-ids-{i}-url'
122+
)
123+
except db_table.meta.client.exceptions.ConditionalCheckFailedException:
124+
pass
125+
# Make sure that generating a 65 ID fails.
126+
with self.assertRaises(db_table.meta.client.exceptions.ConditionalCheckFailedException):
127+
add_url_to_table('https://www.example/test-duplicate-id-end-of-ids-65-url')
128+
129+
```
130+
131+
The test with 1 character passed, but the test with 2 characters did not! This means that with 1 character we could generate
132+
up to half of the available IDs without needing more than 10 retries, while with 2 characters we could not!
133+
Note also that the formula used here to compute the maximum number of IDs was wrong and generated fewer
134+
IDs than the correct formula `max_ids = n**m`:
135+
136+
- `max_ids = n**m`
137+
- `max_ids = 64**1 = 64`
138+
- `max_ids = 64**2 = 4096`
139+
- `int(factorial(n) / (factorial(m) * factorial(n - m)))`
140+
- `n = 64; m = 1; int(factorial(n) / (factorial(m) * factorial(n - m))) = 64`
141+
- `n = 64; m = 2; int(factorial(n) / (factorial(m) * factorial(n - m))) = 2016`
142+
143+
### Nano ID conclusion
144+
145+
Based on the formula and computation above, we have a high risk of having too many collisions already
146+
after 3 years! So this algorithm cannot be used with 8 characters.
147+
148+
## Other algorithms
149+
150+
After some research on shortlink algorithms, I found out that there are two categories of algorithms:
151+
152+
1. Random ID generator (e.g. NanoID)
153+
2. Counter
154+
155+
While the first is very easy to implement, the size of the ID highly depends on the generation rates and
156+
max life of the IDs. For our use case we have a quite high generation rate and an infinite life of the IDs.
157+
This means that it is not the best algorithm.
158+
159+
However, the second algorithm is more robust for our use case. Starting a counter from 0, we could reduce the ID length significantly (less than 6 characters). However, it would require changing the backend to have an atomic counter. With our current
160+
implementation (k8s with DynamoDB) this is not feasible. So we would need to change the DB (maybe PSQL?)
161+
and rewrite the whole python service.
162+
163+
## Decision (to be accepted by others)
164+
165+
I think that with the current algorithm, which we have used for the past years, we are fine until 2037, when the ID will gain one more character.
166+
This algorithm is quite robust; it is not the most efficient in terms of ID length, but it is very simple and fast.
167+
168+
Changing to a random ID generator is, based on our generation rate and ID life cycle, way too risky and brittle.
169+
170+
Changing to a real counter approach would require a lot of effort, starting from scratch.
171+
172+
So IMHO sticking to the current algorithm is the best option for the moment. In the future we can reduce the size of
173+
the shortlink by reducing the size of the host name, which is quite long; e.g. `s.bgdi.ch` instead of `s.geo.admin.ch`.

app/__init__.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
app.config.from_mapping({"TRAP_HTTP_EXCEPTIONS": True})
2626

2727

28+
def is_domain_allowed(domain):
29+
return re.match(ALLOWED_DOMAINS_PATTERN, domain) is not None
30+
31+
2832
@app.before_request
2933
# Add quick log of the routes used to all request.
3034
# Important: this should be the first before_request method, to ensure
@@ -43,13 +47,36 @@ def validate_origin():
4347
# any origin (anyone)
4448
return
4549

46-
if 'Origin' not in request.headers:
47-
logger.error('Origin header is not set')
50+
# The Origin headers is automatically set by the browser and cannot be changed by the javascript
51+
# application. Unfortunately this header is only set if the request comes from another origin.
52+
# Sec-Fetch-Site header is set to `same-origin` by most of the browser except by Safari !
53+
# The best protection would be to use the Sec-Fetch-Site and Origin header, however this is
54+
# not supported by Safari. Therefore we added a fallback to the Referer header for Safari.
55+
sec_fetch_site = request.headers.get('Sec-Fetch-Site', None)
56+
origin = request.headers.get('Origin', None)
57+
referrer = request.headers.get('Referer', None)
58+
59+
if origin is not None:
60+
if is_domain_allowed(origin):
61+
return
62+
logger.error('Origin=%s is not allowed', origin)
4863
abort(403, 'Permission denied')
49-
if not re.match(ALLOWED_DOMAINS_PATTERN, request.headers['Origin']):
50-
logger.error('Origin %s is not allowed', request.headers['Origin'])
64+
65+
if sec_fetch_site is not None:
66+
if sec_fetch_site in ['same-origin', 'same-site']:
67+
return
68+
logger.error('Sec-Fetch-Site=%s is not allowed', sec_fetch_site)
5169
abort(403, 'Permission denied')
5270

71+
if referrer is not None:
72+
if is_domain_allowed(referrer):
73+
return
74+
logger.error('Referer=%s is not allowed', referrer)
75+
abort(403, 'Permission denied')
76+
77+
logger.error('Referer and/or Origin and/or Sec-Fetch-Site headers not set')
78+
abort(403, 'Permission denied')
79+
5380

5481
@app.after_request
5582
def add_charset(response):
@@ -66,13 +93,15 @@ def add_generic_cors_header(response):
6693
if request.endpoint == 'checker':
6794
return response
6895

69-
if (
70-
'Origin' in request.headers and
71-
re.match(ALLOWED_DOMAINS_PATTERN, request.headers['Origin'])
72-
):
73-
# Don't add the allow origin if the origin is not allowed, otherwise that would give
74-
# a hint to the user on how to missused this service
75-
response.headers.set('Access-Control-Allow-Origin', request.headers['Origin'])
96+
if request.endpoint == 'get_shortlink' and get_redirect_param(ignore_errors=True):
97+
# redirect endpoint are allowed from all origins
98+
response.headers['Access-Control-Allow-Origin'] = "*"
99+
else:
100+
response.headers['Access-Control-Allow-Origin'] = request.host_url
101+
if 'Origin' in request.headers and is_domain_allowed(request.headers['Origin']):
102+
response.headers['Access-Control-Allow-Origin'] = request.headers['Origin']
103+
response.headers['Vary'] = 'Origin'
104+
76105
# Always add the allowed methods.
77106
response.headers.set(
78107
'Access-Control-Allow-Methods', ', '.join(get_registered_method(app, request.url_rule))

app/helpers/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,13 @@ def get_registered_method(app, url_rule):
5959
)
6060

6161

62-
def get_redirect_param():
62+
def get_redirect_param(ignore_errors=False):
6363
try:
6464
redirect = strtobool(request.args.get('redirect', 'true'))
6565
except ValueError as error:
66-
abort(400, f'Invalid "redirect" arg: {error}')
66+
redirect = False
67+
if not ignore_errors:
68+
abort(400, f'Invalid "redirect" arg: {error}')
6769
return redirect
6870

6971

tests/unit_tests/base.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,18 @@ def setUp(self):
8383
def tearDown(self):
8484
self.table.delete()
8585

86-
def assertCors(self, response, expected_allowed_methods, check_origin=True): # pylint: disable=invalid-name
87-
if check_origin:
88-
self.assertIn('Access-Control-Allow-Origin', response.headers)
89-
self.assertTrue(
90-
re.match(ALLOWED_DOMAINS_PATTERN, response.headers['Access-Control-Allow-Origin'])
91-
)
86+
def assertCors(
87+
self,
88+
response,
89+
expected_allowed_methods,
90+
origin_pattern=ALLOWED_DOMAINS_PATTERN
91+
): # pylint: disable=invalid-name
92+
self.assertIn('Access-Control-Allow-Origin', response.headers)
93+
self.assertIsNotNone(
94+
re.match(origin_pattern, response.headers['Access-Control-Allow-Origin']),
95+
msg=f"Access-Control-Allow-Origin={response.headers['Access-Control-Allow-Origin']}"
96+
f" doesn't match {origin_pattern}"
97+
)
9298
self.assertIn('Access-Control-Allow-Methods', response.headers)
9399
self.assertListEqual(
94100
sorted(expected_allowed_methods),

tests/unit_tests/test_helpers.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -116,43 +116,3 @@ def test_one_duplicate_short_id(self, mock_generate_short_id):
116116
self.assertEqual(entry1['shortlink_id'], '2')
117117
entry2 = self.db.add_url_to_table(url2)
118118
self.assertEqual(entry2['shortlink_id'], '3')
119-
120-
# @params(1, 2)
121-
# @patch('app.helpers.dynamo_db.generate_short_id')
122-
# def test_duplicate_short_id_end_of_ids(self, m, mock_generate_short_id):
123-
# regex = re.compile(r'^[0-9a-zA-Z-_]{' + str(m) + '}$')
124-
125-
# def generate_short_id_mocker():
126-
# return generate(size=m)
127-
128-
# mock_generate_short_id.side_effect = generate_short_id_mocker
129-
# # with generate(size=1) we have 64 different possible IDs, as we get closer to this number
130-
# # the collision will increase. Here we make sure that we can generate at least the half
131-
# # of the maximal number of unique ID with less than the max retry.
132-
# n = 64
133-
# max_ids = int(factorial(n) / (factorial(m) * factorial(n - m)))
134-
# logger.debug('Try to generate %d entries', max_ids)
135-
# for i in range(max_ids):
136-
# logger.debug('-' * 80)
137-
# logger.debug('Add entry %d', i)
138-
# if i < max_ids / 2:
139-
140-
# next_entry = add_url_to_table(
141-
# f'https://www.example/test-duplicate-id-end-of-ids-{i}-url'
142-
# )
143-
# self.assertIsNotNone(
144-
# regex.match(next_entry['shortlink_id']),
145-
# msg=f"short ID {next_entry['shortlink_id']} don't match regex"
146-
# )
147-
# else:
148-
# # more thant the half of max ids might fail due to more than COLLISION_MAX_RETRY
149-
# # retries, therefore ignore those errors
150-
# try:
151-
# next_entry = add_url_to_table(
152-
# f'https://www.example/test-duplicate-id-end-of-ids-{i}-url'
153-
# )
154-
# except db_table.meta.client.exceptions.ConditionalCheckFailedException:
155-
# pass
156-
# # Make sure that generating a 65 ID fails.
157-
# with self.assertRaises(db_table.meta.client.exceptions.ConditionalCheckFailedException):
158-
# add_url_to_table('https://www.example/test-duplicate-id-end-of-ids-65-url')

0 commit comments

Comments
 (0)