Skip to content

Commit de43cde

Browse files
committed
Initial implementation of support for HTML entities for Julia
1 parent c440315 commit de43cde

File tree

10 files changed

+2396
-10
lines changed

10 files changed

+2396
-10
lines changed

.travis.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ julia:
99
notifications:
1010
email: false
1111
# uncomment the following lines to override the default test script
12-
#script:
13-
# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
14-
# - julia -e 'Pkg.clone(pwd()); Pkg.build("HTML_Entities"); Pkg.test("HTML_Entities"; coverage=true)'
12+
script:
13+
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
14+
- julia -e 'Pkg.clone("https://github.com/JuliaString/StrTables.jl.git"); Pkg.clone(pwd()); Pkg.build("HTML_Entities"); Pkg.test("HTML_Entities"; coverage=true)'
1515
after_success:
1616
# push coverage results to Coveralls
1717
- julia -e 'cd(Pkg.dir("HTML_Entities")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'

LICENSE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The HTML_Entities.jl package is licensed under the MIT "Expat" License:
22

3-
> Copyright (c) 2017: ScottPJones.
3+
> Copyright (c) 2017: Gandalf Software, Inc. (Scott Paul Jones) and other contributors
44
>
55
> Permission is hereby granted, free of charge, to any person obtaining a copy
66
> of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
# HTML_Entities
22

3-
[![Build Status](https://travis-ci.org/ScottPJones/HTML_Entities.jl.svg?branch=master)](https://travis-ci.org/ScottPJones/HTML_Entities.jl)
3+
[![Build Status](https://travis-ci.org/JuliaString/HTML_Entities.jl.svg?branch=master)](https://travis-ci.org/JuliaString/HTML_Entities.jl)
44

5-
[![Coverage Status](https://coveralls.io/repos/ScottPJones/HTML_Entities.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/ScottPJones/HTML_Entities.jl?branch=master)
5+
[![Coverage Status](https://coveralls.io/repos/JuliaString/HTML_Entities.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/JuliaString/HTML_Entities.jl?branch=master)
6+
7+
[![codecov.io](http://codecov.io/github/JuliaString/HTML_Entities.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaString/HTML_Entities.jl?branch=master)
8+
9+
HTML_Entities.jl: Support for using HTML entity names for characters
10+
====================================================================
11+
12+
This builds tables for looking up HTML entity names and returning the Unicode character(s),
13+
looking up a character or pair of characters and finding HTML names that return it/them,
14+
and finding all of the HTML name completions for a particular string, if any.
615

7-
[![codecov.io](http://codecov.io/github/ScottPJones/HTML_Entities.jl/coverage.svg?branch=master)](http://codecov.io/github/ScottPJones/HTML_Entities.jl?branch=master)

REQUIRE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
julia 0.5
2+
StrTables

data/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Data files for HTML_Entities
2+
3+
# Input:
4+
5+
# Output:
6+
7+
* ``html.dat``

deps/build.jl

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# License is MIT: https://github.com/JuliaString/HTML_Entities.jl/blob/master/LICENSE.md
2+
#
3+
# Mapping from HTML entities to the corresponding Unicode codepoint.
4+
5+
println("Running HTML entity build in ", pwd())

using StrTables

# Version tag stored as the first element of the tuple built by make_tables().
# Declared `const` (like the other globals below) because it is read from
# inside a function; a non-const global would be untyped there.
const VER = UInt32(1)

# Presumably defines the entity lists (htmlonechar, htmlnonbmp, htmltwochar)
# that make_tables() iterates over -- they are not defined in this file.
include("htmlnames.jl")

# NOTE(review): not referenced anywhere else in this script -- confirm whether
# htmlnames.jl uses it before removing.
const disp = [false]

# Name and directory of the data file written at the end of this script.
const fname = "html.dat"
const datapath = joinpath(Pkg.dir(), "HTML_Entities", "data")

# NOTE(review): not referenced anywhere else in this script -- confirm whether
# htmlnames.jl uses it before removing.
const empty_str = ""
19+
20+
"""
    sortsplit!(index, vec, base) -> (vals, inds, newbase)

Sort `vec` (a vector of `(value, ind)` pairs) in place, then split it into two
parallel vectors: `vals`, holding the first element of every sorted pair, and
`inds`, holding the second.

For the pair that ends up at sorted position `pos`, `index[ind]` is set to
`UInt16(base + pos)`.  The returned `newbase` is `base + length(vec)`, i.e. the
value to pass as `base` when the next group is appended after this one.
"""
function sortsplit!{T}(index::Vector{UInt16}, vec::Vector{Tuple{T, UInt16}}, base)
    sort!(vec)
    n = length(vec)
    vals = Vector{T}(n)
    inds = Vector{UInt16}(n)
    for pos in 1:n
        value, ind = vec[pos]
        vals[pos] = value
        inds[pos] = ind
        # Record where entry `ind` landed, offset by the groups placed before it
        index[ind] = UInt16(base + pos)
    end
    return vals, inds, base + n
end
33+
34+
"""
    make_tables()

Build all of the lookup tables that get saved to the data file.

Entity names and values are read from the globals `htmlonechar`, `htmlnonbmp`
and `htmltwochar` (not defined in this file; presumably supplied by the
included `htmlnames.jl`).  The names are sorted, and each value is classified
as a single BMP character, a single character in the range 0x10000 - 0x1ffff,
or a pair of BMP characters.

Returns a tuple of:
`VER`, a timestamp string, a description string, `base32` and `base2c` (the
offsets at which the second and third groups start in the combined numbering,
as `UInt32`), a `StrTable` of the sorted names, the name -> combined-index
vector, and then for each of the three groups its sorted values and the
matching name indices.

Raises an error if a value has more than two characters, if either character
of a pair is above 0xffff, or if a single character is above 0x1ffff.
"""
function make_tables()
    symnam = Vector{String}()
    symval = Vector{String}()

    # Single characters, given directly as a code point
    for pair in htmlonechar
        push!(symnam, pair[1])
        push!(symval, string(Char(pair[2])))
    end
    # Single characters above the BMP, given as an offset from 0x10000
    for pair in htmlnonbmp
        push!(symnam, pair[1])
        push!(symval, string(Char(0x10000+pair[2])))
    end
    # Two-character sequences
    for pair in htmltwochar
        push!(symnam, pair[1])
        p = pair[2]
        push!(symval, string(Char(p[1]), Char(p[2])))
    end

    # We want to build a table of all the names, sort them, then create a StrTable out of them
    srtnam = sortperm(symnam)
    srtval = symval[srtnam] # Values, sorted the same as srtnam

    # BMP characters
    l16 = Vector{Tuple{UInt16, UInt16}}()
    # non-BMP characters (in range 0x10000 - 0x1ffff)
    l32 = Vector{Tuple{UInt16, UInt16}}()
    # two characters packed into UInt32, first character in high 16-bits
    l2c = Vector{Tuple{UInt32, UInt16}}()

    for i in eachindex(srtnam)
        chrs = collect(srtval[i])
        length(chrs) > 2 && error("Too long sequence of characters $chrs")
        if length(chrs) == 2
            (chrs[1] > '\uffff' || chrs[2] > '\uffff') &&
                error("Character $(chrs[1]) or $(chrs[2]) > 0xffff")
            # Grouping is explicit so the packing does not depend on the
            # relative precedence of `%`, `<<` and `|`
            push!(l2c, (((chrs[1]%UInt32) << 16) | (chrs[2]%UInt32), i))
        elseif chrs[1] > '\U1ffff'
            error("Character $(chrs[1]) too large: $(UInt32(chrs[1]))")
        elseif chrs[1] > '\uffff'
            # Stored as the 16-bit offset from 0x10000
            push!(l32, ((chrs[1]-0x10000)%UInt32, i))
        else
            push!(l16, (chrs[1]%UInt16, i))
        end
    end

    # We now have 3 vectors, for single BMP characters, for non-BMP characters, and for 2 BMP chars
    # each has the value and an index into the name table
    # We need to create a vector the same size as the name table, that gives the index
    # into one of the three tables, in order to go from names to 1 or 2 output characters
    # We also need, for each of the 3 tables, a sorted vector that goes from the indices
    # in each table to the index into the name table (so that we can find multiple names for
    # each character)

    # Every name index is pushed onto exactly one of the three lists above, so
    # the three sortsplit! calls together fill every slot of this vector
    indvec = Vector{UInt16}(length(srtnam))
    vec16, ind16, base32 = sortsplit!(indvec, l16, 0)
    vec32, ind32, base2c = sortsplit!(indvec, l32, base32)
    vec2c, ind2c, basefn = sortsplit!(indvec, l2c, base2c)

    return (VER, string(now()), "loaded from htmlnames.jl",
            base32%UInt32, base2c%UInt32, StrTable(symnam[srtnam]), indvec,
            vec16, ind16, vec32, ind32, vec2c, ind2c)
end
96+
97+
# Script entry point: build the tables and write them to the data file.
savfile = joinpath(datapath, fname)

println("Creating tables")
tup = make_tables()

println("Saving tables to ", savfile)
StrTables.save(savfile, tup)

println("Done")

0 commit comments

Comments
 (0)