Commit f3767b4

author pmasl committed
Added Unicode notebook samples
1 parent 9f8b0f1 commit f3767b4

File tree

11 files changed: +165803 -0 lines changed

samples/features/sql2019notebooks/README.md

Lines changed: 7 additions & 0 deletions
@@ -26,6 +26,13 @@ The [What's New](https://docs.microsoft.com/sql/sql-server/what-s-new-in-sql-ser

* **[Basic_ADR.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/accelerated-database-recovery/basic_adr.ipynb)** - In this notebook, you will see how fast long-running transaction rollback can now be with Accelerated Database Recovery. You will also see that a long active transaction does not affect the ability to truncate the transaction log.

* **[Recovery_ADR.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/accelerated-database-recovery/recovery_adr.ipynb)** - In this example, you will see how Accelerated Database Recovery will speed up recovery.

### Unicode Support (UTF-8 and UTF-16)

* **[DataType_WesternMyth.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/unicode/notebooks/DataType_WesternMyth.ipynb)** - In this notebook, you will see proof that the integer that defines the length of string types (CHAR/VARCHAR/NCHAR/NVARCHAR) does not mean "number of characters" but "number of bytes to store", debunking a common misconception in SQL Server.

* **[Functional.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/unicode/notebooks/Functional.ipynb)** - In this notebook, you will see how to use UTF-8 in your database or columns.

* **[Storage.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/unicode/notebooks/Storage.ipynb)** - In this notebook, you will see how significant the storage footprint differences are between Unicode data encoded in UTF-8 and UTF-16.

* **[Perf_Latin.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/unicode/notebooks/Perf_Latin.ipynb)** - In this notebook, you will see the performance differences of using string data encoded in UTF-8 and UTF-16, using Latin data.

* **[Perf_Non-Latin.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/unicode/notebooks/Perf_Non-Latin.ipynb)** - In this notebook, you will see the performance differences of using string data encoded in UTF-8 and UTF-16, using non-Latin data.

### SQL Server 2019 Querying 1 TRILLION rows

* **[OneTrillionRowsWarm.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/sql2019notebooks/OneTrillionRowsWarm.ipynb)** - This notebook shows how SQL Server 2019 reads **9 BILLION rows/second** from a 1 trillion row table with a warm cache.

* **[OneTrillionRowsCold.ipynb](https://github.com/microsoft/sql-server-samples/blob/master/samples/features/sql2019notebooks/OneTrillionRowsCold.ipynb)** - This notebook shows how SQL Server 2019 performs IO at **~24GB/s** using a 1 trillion row table with a cold cache.

Lines changed: 171 additions & 0 deletions

@@ -0,0 +1,171 @@

----------------------------------------------
-- Data type sizes - a western myth
----------------------------------------------

-- Note: my server default is SQL_Latin1_General_CP1_CI_AS
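
-- (Quick side check, not in the original sample: confirm your own server
-- default collation before running the demos.)
SELECT SERVERPROPERTY('Collation') AS [ServerDefaultCollation];
GO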

-- Test Latin character strings with Latin collation
-- Set size limit of data types to be the same under the Basic Multilingual Plane (BMP)
-- Characters between 1 byte (ASCII) and 3 bytes (East Asian)

DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (c1 varchar(24) COLLATE Latin1_General_100_CI_AI,
    c2 nvarchar(8) COLLATE Latin1_General_100_CI_AI);
INSERT INTO t1 VALUES (N'MyString', N'MyString');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t1;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t1;
GO

-- That's as expected. So what was I talking about?

-- Test Chinese character strings with Latin collation
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (c1 varchar(24) COLLATE Latin1_General_100_CI_AI,
    c2 nvarchar(8) COLLATE Latin1_General_100_CI_AI);
INSERT INTO t1 VALUES (N'敏捷的棕色狐狸跳', N'敏捷的棕色狐狸跳');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t1;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t1;
GO

-- Uh-oh, data loss on the varchar example. Why?
-- varchar is bound to code page encoding, and these code points cannot be found in the Latin code page,
-- so each character falls back to '?' (CHAR(63))
SELECT ASCII('敏' COLLATE Latin1_General_100_CI_AI), CHAR(63);
SELECT ASCII('捷' COLLATE Latin1_General_100_CI_AI), CHAR(63);

-- But why didn't it happen in the nvarchar example?
-- These Chinese characters are double-byte and within the Basic Multilingual Plane (BMP).
-- nvarchar with this non-SC collation encodes in UCS-2 (BMP), not the code page.
SELECT UNICODE(N'敏' COLLATE Latin1_General_100_CI_AI), NCHAR(25935);
SELECT UNICODE(N'捷' COLLATE Latin1_General_100_CI_AI), NCHAR(25463);

-- Irrespective of collation now. With a Unicode-capable data type,
-- collation only sets linguistic algorithms
-- (comparison = sorting; case sensitivity = upper/lowercase).
SELECT UNICODE(N'敏' COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI), NCHAR(25935);
SELECT UNICODE(N'捷' COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI), NCHAR(25463);

-- Now test Chinese character strings with Chinese collation
DROP TABLE IF EXISTS t2;
CREATE TABLE t2 (c1 varchar(24) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI,
    c2 nvarchar(8) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI);
INSERT INTO t2 VALUES (N'敏捷的棕色狐狸跳', N'敏捷的棕色狐狸跳');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t2;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t2;
GO

-- Now the varchar example is correct. But there are 2 bytes per character?...
-- Myth buster: the code page defines string length for varchar. It's not always 1 byte per character.
-- Wasn't East Asian 3 bytes? Yes, in UTF-8. But under the Chinese collation's code page,
-- these characters are encoded using 2 bytes, just like UCS-2/UTF-16.
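
-- (Quick side check, not in the original sample: confirm the double-byte
-- code page encoding directly, independent of the current database.)
SELECT DATALENGTH(CONVERT(varchar(4), N'敏' COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI)) AS [BytesInCodePage]; -- returns 2
GO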

-- Test with Supplementary Characters (4 bytes) and using SC
DROP TABLE IF EXISTS t2;
CREATE TABLE t2 (c1 varchar(24) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC,
    c2 nvarchar(8) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC);
INSERT INTO t2 VALUES (N'👶👦👧👨👩👴👵👨', N'👶👦👧👨👩👴👵👨');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t2;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t2;
GO

-- Fix the error: each of these emojis is a surrogate pair (2 UTF-16 code units),
-- so 8 emojis need nvarchar(16)
DROP TABLE IF EXISTS t2;
CREATE TABLE t2 (c1 varchar(24) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC,
    c2 nvarchar(16) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC);
INSERT INTO t2 VALUES (N'👶👦👧👨👩👴👵👨', N'👶👦👧👨👩👴👵👨');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t2;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t2;
GO

-- Varchar still doesn't encode? Use a UTF-8 collation:
-- in UTF-8 each of these emojis takes 4 bytes, so 8 emojis fit in varchar(48)
DROP TABLE IF EXISTS t2;
CREATE TABLE t2 (c1 varchar(48) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC_UTF8,
    c2 nvarchar(16) COLLATE Chinese_Traditional_Stroke_Order_100_CI_AI_SC);
INSERT INTO t2 VALUES (N'👶👦👧👨👩👴👵👨', N'👶👦👧👨👩👴👵👨');
SELECT LEN(c1) AS [varchar LEN],
    DATALENGTH(c1) AS [varchar DATALENGTH], c1
FROM t2;
SELECT LEN(c2) AS [nvarchar LEN],
    DATALENGTH(c2) AS [nvarchar DATALENGTH], c2
FROM t2;
GO

-- What if I needed all these in one database? Easy, I could just use nvarchar.
DROP TABLE IF EXISTS t3;
CREATE TABLE t3 (c1 nvarchar(110) COLLATE Latin1_General_100_CI_AI_SC);
INSERT INTO t3 VALUES (N'MyStringThequickbrownfoxjumpsoverthelazydogIsLatinAscii敏捷的棕色狐狸跳👶👦');
SELECT LEN(c1) AS [nvarchar UTF16 LEN],
    DATALENGTH(c1) AS [nvarchar UTF16 DATALENGTH], c1
FROM t3;
GO

-- But the majority of my data is set to Latin (ASCII), so use varchar with a UTF-8 collation
DROP TABLE IF EXISTS t4;
CREATE TABLE t4 (c1 varchar(110) COLLATE Latin1_General_100_CI_AI_SC_UTF8);
INSERT INTO t4 VALUES (N'MyStringThequickbrownfoxjumpsoverthelazydogIsLatinAscii敏捷的棕色狐狸跳👶👦');
SELECT LEN(c1) AS [varchar UTF8 LEN],
    DATALENGTH(c1) AS [varchar UTF8 DATALENGTH], c1
FROM t4;
GO

-- Where are the savings?
SELECT DATALENGTH(N'MyStringThequickbrownfoxjumpsoverthelazydogIsLatinAscii') AS [Latin_UTF16_2bytes],
    DATALENGTH(N'敏捷的棕色狐狸跳') AS [Chinese_UTF16_2bytes],
    DATALENGTH(N'👶👦') AS [SC_UTF16_4bytes];
SELECT DATALENGTH('MyStringThequickbrownfoxjumpsoverthelazydogIsLatinAscii' COLLATE Latin1_General_100_CI_AI_SC_UTF8) AS [Latin_UTF8_1byte],
    DATALENGTH('敏捷的棕色狐狸跳' COLLATE Latin1_General_100_CI_AI_SC_UTF8) AS [Chinese_UTF8_3bytes],
    DATALENGTH('👶👦' COLLATE Latin1_General_100_CI_AI_SC_UTF8) AS [SC_UTF8_4bytes];
GO
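
-- (A quick reading of the two result sets, not from the original sample: the
-- Latin-only string halves in size under UTF-8 (1 byte instead of 2 per
-- character), the Chinese string actually grows (3 bytes instead of 2), and
-- Supplementary Characters take 4 bytes in either encoding. UTF-8 only saves
-- space when the data is mostly ASCII.)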

Lines changed: 182 additions & 0 deletions

@@ -0,0 +1,182 @@

----------------------------------------------
-- UTF-8 Functional
----------------------------------------------

USE master;
GO

DROP DATABASE IF EXISTS MyNonUtf8Database;
DROP DATABASE IF EXISTS MyUtf8Database;
DROP DATABASE IF EXISTS MyFormerlyUnicodeOnlyDatabase;
DROP DATABASE IF EXISTS MaskingDatabase;

--
-- Create a database that is NOT collated with UTF-8.
-- This demonstrates that you can insert Unicode data into VARCHAR columns collated with UTF-8.
--
CREATE DATABASE MyNonUtf8Database COLLATE SQL_Latin1_General_CP1_CI_AI;
GO

USE MyNonUtf8Database;
GO

CREATE TABLE MyUtf8Table (datakind VARCHAR(100), data VARCHAR(8000) COLLATE Latin1_General_100_CI_AS_SC_UTF8);
GO

INSERT INTO MyUtf8Table
VALUES ('ASCII - 1 byte per character', N'Thequickbrownfoxjumpsoverthelazydog'),
    ('Cyrillic - 2 bytes per character', N'Быстраякоричневаялисапрыгаетчерезленивуюсобаку'),
    ('Far East - 3 bytes per character', N'敏捷的棕色狐狸跳过了懒狗'),
    ('Emojis - 4 bytes per character', N'👶👦👧👨👩👴👵👨👩👨👩👨👩👨👩'),
    ('Emojis with Variation Selector - 6 bytes per glyph', N'⚕️⚖️↔︎↕︎↖︎↗︎↘︎↙︎↩︎↪︎↔️↕️↖️↗️↘️↙️↩️↪️'),
    ('Ashi with Supplementary Variation Selector - 7 bytes per glyph', N'芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄅芦󠄆芦󠄇芦󠄈芦󠄉芦󠄃芦󠄂芦󠄁芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄈芦󠄉');
GO

SELECT datakind, data
FROM MyUtf8Table;
GO

-- This demo used the N' syntax, as string literals are always collated in the collation
-- of the currently active database.
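
-- (Added illustration, not in the original sample: without the N prefix, a
-- literal is first converted to this database's code page, so non-Latin
-- characters are lost before they ever reach the UTF-8 column.)
SELECT '敏捷的棕色狐狸跳过了懒狗' AS [lossy_literal], N'敏捷的棕色狐狸跳过了懒狗' AS [preserved_literal];
GO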

--
-- Create a database collated with UTF-8.
-- This demonstrates that string literals can now be used without N'',
-- as string literals are collated with the database collation, and can hold any characters.
--
CREATE DATABASE MyUtf8Database COLLATE Lithuanian_100_CS_AI_WS_SC_UTF8;
GO

USE MyUtf8Database;
GO

CREATE TABLE MyTableWithInheritedCollation (datakind VARCHAR(100), data VARCHAR(8000));
GO

INSERT INTO MyTableWithInheritedCollation
VALUES ('ASCII - 1 byte per character', 'Thequickbrownfoxjumpsoverthelazydog'),
    ('Cyrillic - 2 bytes per character', 'Быстраякоричневаялисапрыгаетчерезленивуюсобаку'),
    ('Far East - 3 bytes per character', '敏捷的棕色狐狸跳过了懒狗'),
    ('Emojis - 4 bytes per character', '👶👦👧👨👩👴👵👨👩👨👩👨👩👨👩'),
    ('Emojis with Variation Selector - 6 bytes per glyph', '⚕️⚖️↔︎↕︎↖︎↗︎↘︎↙︎↩︎↪︎↔️↕️↖️↗️↘️↙️↩️↪️'),
    ('Ashi with Supplementary Variation Selector - 7 bytes per glyph', '芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄅芦󠄆芦󠄇芦󠄈芦󠄉芦󠄃芦󠄂芦󠄁芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄈芦󠄉');
GO

SELECT datakind, data
FROM MyTableWithInheritedCollation;
GO

--
-- Create a database with a formerly Unicode-only collation (one without its own Windows code page).
-- With the _UTF8 suffix, you can do that now.
--
CREATE DATABASE GonnaFailDueToUnicodeOnlyCollation COLLATE Lao_100_CS_AS_KS_WS_SC;
GO

CREATE DATABASE MyFormerlyUnicodeOnlyDatabase COLLATE Lao_100_CS_AS_KS_WS_SC_UTF8;
GO

USE MyFormerlyUnicodeOnlyDatabase;
GO

CREATE TABLE MyFormerlyUnicodeOnlyTable (datakind VARCHAR(100), data VARCHAR(8000));
GO

INSERT INTO MyFormerlyUnicodeOnlyTable (datakind, data)
SELECT datakind, data
FROM MyNonUtf8Database..MyUtf8Table;
GO

SELECT datakind, data
FROM MyFormerlyUnicodeOnlyTable;
GO

--
-- Demo of one orthogonality feature - data masking
--
CREATE DATABASE MaskingDatabase COLLATE Chinese_PRC_90_CI_AI_SC_UTF8;
GO

USE MaskingDatabase;
GO

CREATE USER ToBeKeptAway WITHOUT LOGIN;
GO

CREATE TABLE KeepAway (top_secret_data VARCHAR(8000) COLLATE Mapudungan_100_CS_AS_SC_UTF8 MASKED
    WITH (FUNCTION = 'partial(2, "💩💩💩💩💩", 2)'));
GO

INSERT INTO KeepAway (top_secret_data)
SELECT data
FROM MyNonUtf8Database..MyUtf8Table;
GO

GRANT SELECT
    ON KeepAway
    TO ToBeKeptAway;
GO

EXECUTE AS USER = 'ToBeKeptAway';

SELECT top_secret_data
FROM KeepAway;

REVERT;
GO
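
-- (What to expect, not from the original sample: when queried as ToBeKeptAway,
-- each row shows only its first and last two characters, with the custom
-- '💩💩💩💩💩' padding from the partial() mask in between - the mask works the
-- same over UTF-8 data.)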

----------------------------
/*
See how many bytes each character requires in both the UTF-8 and UTF-16 encodings.
Returns all 65,536 BMP (Basic Multilingual Plane) characters (which is also the entire UCS-2 character set), plus 3 Supplementary Characters.
Since all Supplementary Characters are 4 bytes in both encodings, there is no need to return more of them, but we do need a few to see that they are:
a) all 4 bytes
b) encoded slightly differently
*/
;
WITH nums ([CodePoint])
AS (
    SELECT TOP (65536) (ROW_NUMBER() OVER (ORDER BY (SELECT 0)) - 1)
    FROM [master].[sys].[columns] col
    CROSS JOIN [master].[sys].[objects] obj
), chars
AS (
    SELECT nums.[CodePoint],
        CONVERT(VARCHAR(4), NCHAR(nums.[CodePoint]) COLLATE Latin1_General_100_CI_AS_SC_UTF8) AS [TheChar],
        CONVERT(VARBINARY(4), CONVERT(VARCHAR(4), NCHAR(nums.[CodePoint]) COLLATE Latin1_General_100_CI_AS_SC_UTF8)) AS [UTF8]
    FROM nums

    UNION ALL

    SELECT tmp.val,
        CONVERT(VARCHAR(4), CONVERT(NVARCHAR(5), tmp.hex) COLLATE Latin1_General_100_CI_AS_SC_UTF8) AS [TheChar],
        CONVERT(VARBINARY(4), CONVERT(VARCHAR(4), CONVERT(NVARCHAR(5), tmp.hex) COLLATE Latin1_General_100_CI_AS_SC_UTF8)) AS [UTF8]
    FROM (
        VALUES (65536, 0x00D800DC),  -- Linear B Syllable B008 A (U+10000)
            (67618, 0x02D822DC),     -- Cypriot Syllable Pu (U+10822)
            (129384, 0x3ED868DD)     -- Pretzel (U+1F968)
    ) tmp(val, hex)
)
SELECT chr.[CodePoint],
    COALESCE(chr.[TheChar], N'TOTALS:') AS [Character],
    chr.[UTF8] AS [UTF8_Hex],
    DATALENGTH(chr.[UTF8]) AS [UTF8_Bytes],
    COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 1 THEN 'x' END) AS [1-byte],
    COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 2 THEN 'x' END) AS [2-bytes],
    COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 3 THEN 'x' END) AS [3-bytes],
    COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 4 THEN 'x' END) AS [4-bytes],
    CONVERT(VARBINARY(4), CONVERT(NVARCHAR(3), chr.[TheChar])) AS [UTF16(LE)_Hex],
    DATALENGTH(CONVERT(NVARCHAR(3), chr.[TheChar])) AS [UTF16_Bytes],
    (DATALENGTH(CONVERT(NVARCHAR(3), chr.[TheChar])) - DATALENGTH(chr.[TheChar])) AS [UTF8savingsOverUTF16]
FROM chars chr
GROUP BY ROLLUP((chr.[CodePoint], chr.[TheChar], chr.[UTF8]));
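
-- (For reference, not from the original sample: the UTF-8 totals should line
-- up with the encoding's design - code points U+0000-U+007F take 1 byte,
-- U+0080-U+07FF take 2 bytes, the rest of the BMP takes 3 bytes, and
-- Supplementary Characters take 4 bytes - while UTF-16 is a flat 2 bytes
-- across the BMP and 4 bytes beyond it.)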
