Skip to content

Commit 67a8e22

Browse files
committed
Add SQL Greek-to-BetaCode letters-only function
Enable database-side transliteration of Greek text into lowercase Beta Code letters by normalizing to NFD, mapping base letters, dropping combining diacritics, and replacing every non-Greek-letter character with a space, for consistent indexing and matching.
1 parent 5206d74 commit 67a8e22

File tree

1 file changed

+91
-0
lines changed

1 file changed

+91
-0
lines changed

sql/12_functions.sql

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,3 +540,94 @@ BEGIN
540540

541541
END;
542542
$BODY$;
543+
544+
-- Convert Greek text to lowercase, letters-only Beta Code.
--
-- The input is normalized to NFD so that every precomposed (polytonic or
-- monotonic) character becomes a base letter followed by combining marks.
-- The marks are dropped and each base letter is mapped to its Beta Code letter.
--
-- Parameters:
--   p_input  text to transliterate; may be NULL or empty.
-- Returns:
--   NULL when p_input is NULL; otherwise a string in which
--     * combining diacritical marks (U+0300..U+036F) produce nothing,
--     * Greek letters (either case, incl. final sigma and digamma) become
--       the lowercase Beta Code letters a-z,
--     * every other character (whitespace, punctuation, digits, Latin
--       letters, ...) becomes one space.
-- Notes:
--   * normalize() is only available when the server encoding is UTF8.
--   * The function has no side effects and reads no tables, hence
--     IMMUTABLE PARALLEL SAFE (usable in expression indexes and in
--     parallel plans).
DROP FUNCTION IF EXISTS public.greek_to_betacode(text);
CREATE OR REPLACE FUNCTION public.greek_to_betacode(p_input text)
    RETURNS text
    LANGUAGE 'plpgsql'
    COST 100
    IMMUTABLE PARALLEL SAFE
AS $BODY$
DECLARE
    -- Source alphabet and its Beta Code image; the two strings are aligned
    -- position by position (lowercase row, uppercase row, theta symbol).
    -- Both cases are listed explicitly so the result does not depend on how
    -- lower() behaves under the database collation (in a "C" locale it
    -- leaves Greek capitals untouched).
    c_greek CONSTANT text :=
           'αβγδεζηθικλμνξοπρσςτυφχψωϝ'
        || 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩϜ'
        || 'ϴ';
    c_beta CONSTANT text :=
           'abgdezhqiklmncoprsstufxywv'
        || 'abgdezhqiklmncoprstufxywv'
        || 'q';
    s text;
BEGIN
    IF p_input IS NULL THEN
        RETURN NULL;
    END IF;

    -- Normalize to NFD: base letter + combining diacritics.
    s := normalize(p_input, NFD);

    -- Drop every combining diacritical mark (breathings, accents, diaeresis,
    -- iota subscript, macron, breve, underdot, ...) so that a mark never
    -- splits a word. COLLATE "C" keeps regex matching purely code-point
    -- based, independent of the caller's collation.
    s := regexp_replace(s COLLATE "C", U&'[\0300-\036F]', '', 'g');

    -- Everything that is not a Greek letter becomes a space. This must run
    -- before the transliteration, otherwise Latin letters already present in
    -- the input would be indistinguishable from Beta Code output.
    s := regexp_replace(s COLLATE "C", '[^' || c_greek || ']', ' ', 'g');

    -- Map each Greek base letter to its Beta Code letter.
    RETURN translate(s, c_greek, c_beta);
END;
$BODY$;

0 commit comments

Comments
 (0)