Skip to content

Commit 0ee2377

Browse files
authored
Merge pull request #370 from inariksit/morphodict
Minor improvements in MkMorphodict.hs + Finnish lexicon in new format
2 parents ace36f7 + e3c6694 commit 0ee2377

File tree

11 files changed

+61144
-62140
lines changed

11 files changed

+61144
-62140
lines changed

src/finnish/NewDictFin.gf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
1988519885
lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
1988619886
lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
1988719887
lin monias_N = mkN {s = d41 "monias"} ;
19888-
lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
19888+
lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
1988919889
lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
1989019890
lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
1989119891
lin monikko_N = mkN {s = d04A "monikko"} ;
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
4138741387
lin yks_Adv = mkAdv {s = c99 "yks"} ;
4138841388
lin ykseys_N = mkN {s = d40 "ykseys"} ;
4138941389
lin yksi_N = mkN {s = d31 "yksi"} ;
41390-
lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
41390+
lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
4139141391
lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
4139241392
lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
4139341393
lin yksikkö_N = mkN {s = d04A "yksikkö"} ;

src/morphodict/MkMorphodict.hs

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ import PGF
55
import qualified Data.Map as M
66
import Data.Char
77
import Data.List
8+
import Safe
89
import System.Environment (getArgs)
10+
import Debug.Trace
911

1012
-- AR 2020-02-28
1113

@@ -28,29 +30,33 @@ usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
2830
main = do
2931
xx <- getArgs
3032
if length xx /= 4
31-
then putStrLn usage
33+
then do
34+
putStrLn "Usage:"
35+
putStrLn usage
36+
putStrLn $ "Got instead: " ++ show xx
3237
else do
33-
let mode:configfile:datafile:outfile:_ = xx
38+
let mode:configfile:datafile:outfile:_ = xx
3439
config <- readFile configfile >>= return . mkConfig
35-
40+
3641
rawdata <- case mode of
3742
"pgf" -> pgfFile2rawData config datafile
38-
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
43+
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
44+
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
3945
rawdata2gf config rawdata outfile
4046

4147

4248
rawdata2gf config rawdata outfile = do
43-
49+
4450
let env = MDEnv rawdata config
4551
let (absrules,cncrules) = mkMorphoDict env
46-
52+
4753
absheader <- readFile (outfile ++ "Abs.header")
4854
cncheader <- readFile (outfile ++ ".header")
49-
55+
5056
writeFile (outfile ++ "Abs.gf") absheader
5157
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
5258
appendFile (outfile ++ "Abs.gf") "}"
53-
59+
5460
writeFile (outfile ++ ".gf") cncheader
5561
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
5662
appendFile (outfile ++ ".gf") "}"
@@ -68,7 +74,7 @@ pgfFile2rawData config pgffile = do
6874
cat <- cats,
6975
f <- functionsByCat pgf (mkCId cat),
7076
lin <- tabularLinearizes pgf lang (mkApp f [])
71-
]
77+
]
7278

7379
type Cat = String
7480
type Fun = String
@@ -84,12 +90,13 @@ mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
8490
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
8591
where
8692
mkOne s = case words s of
87-
"--":_ -> Right s
93+
"--":_ -> Right s
8894
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
8995
_ -> Right s
9096
mkArgs ints = case break (=="#") ints of
91-
(ss,[]) -> (map read ss, [])
92-
(ss,_:fs) -> (map read ss, map read fs)
97+
(ss,[]) -> (map read' ss, [])
98+
(ss,_:fs) -> (map read' ss, map read' fs)
99+
read' a = readNote [] a -- Safe.readNote provides better error message
93100

94101
getRawData s = case words s of
95102
c:cs -> (c,cs)
@@ -118,10 +125,13 @@ mkMorphoDict env =
118125
(([lemma],newcat),(oper, appSig sig args)) |
119126
(oldcat,args) <- raws,
120127
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
121-
let lemma = args !! head (fst sig)
122-
]
128+
let lemma = args `at` head (fst sig)
129+
]
123130

124-
appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
131+
appSig (ints,feats) args =
132+
-- If there's wrong number in config file, uncomment the line below to see which number it should be
133+
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
134+
([args `at` i | i <- ints], [args `at` i | i <- feats])
125135

126136
mergeRules :: [RawRule] -> [RawRule]
127137
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
@@ -153,16 +163,20 @@ mkMorphoDict env =
153163
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
154164
_ -> fls
155165

156-
157-
mkFun = showCId . mkCId . concat . intersperse "_"
166+
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
167+
-- "hello_world_hello_friends_hello_all"
168+
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
169+
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
170+
where
171+
removeHyphen [] = []
172+
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
173+
removeHyphen ('-':cs) = ' ' : removeHyphen cs
174+
removeHyphen (c:cs) = c : removeHyphen cs
158175

159176
quote s = "\"" ++ s ++ "\""
160177

161-
162-
163-
164178
{- ---- let us ignore this
165-
findCompounds :: [RuleData] -> [RuleData]
179+
findCompounds :: [RuleData] -> [RuleData]
166180
findCompounds = getCompounds . sortOn cat_orthrevforms
167181
168182
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
@@ -171,9 +185,9 @@ quote s = "\"" ++ s ++ "\""
171185
revstem = head . snd . cat_revforms
172186
wforms (_,(_,_:forms)) = forms
173187
174-
getCompounds :: [RuleData] -> [RuleData]
188+
getCompounds :: [RuleData] -> [RuleData]
175189
getCompounds fls = case fls of
176-
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
190+
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
177191
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
178192
([],_:_) -> markWith fl [] : getCompounds fls2
179193
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
@@ -188,7 +202,7 @@ quote s = "\"" ++ s ++ "\""
188202
189203
isPrefixWord x xy =
190204
length suff > 1 && ---- compound first part must be at least two letters long
191-
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
205+
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
192206
isPrefixOf x xy ---- and of course be a prefix
193207
where
194208
suff = drop (length x) xy
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
N : N mkN 0 1 2 4 7 13 14 16 17 19
2+
A : A mkA' 0 1 2 4 7 13 14 16 17 19
3+
V : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
4+
V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
5+
Adv : Adv mkAdv 0
6+
Prep : Prep mkPrep 0

0 commit comments

Comments
 (0)