Skip to content

Commit d8935d8

Browse files
committed
Added a new exported function readdlmsql(), which reads a delimited file (.csv, .tsv, .wsv, etc.) and runs an SQL statement against it in the same call, returning a DataFrame immediately.
1 parent 255afe7 commit d8935d8

File tree

3 files changed

+815040
-14
lines changed

3 files changed

+815040
-14
lines changed

src/Sqlite.jl

Lines changed: 152 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module Sqlite
22

33
using DataFrames
44

5-
export sqlitedb
5+
export sqlitedb, readdlmsql
66

77
include("Sqlite_consts.jl")
88
include("Sqlite_api.jl")
@@ -33,6 +33,9 @@ typealias TableInput Union(DataFrame,String)
3333
const null_resultset = DataFrame(0)
3434
const null_SqliteDB = SqliteDB("",C_NULL,null_resultset)
3535
sqlitedb = null_SqliteDB #Create default connection = null
36+
#Patterns used to guess a column's sqlite type from a sample field value.
#INTrx: an optionally signed run of digits (a leading sign must not demote the value to REAL)
const INTrx = r"^[+-]?\d+$"
#STRINGrx: any character that cannot appear in a plain or scientific-notation number
const STRINGrx = r"[^eE0-9\.\-\+]"i
#FLOATrx: optionally signed decimal number with optional fraction and exponent
const FLOATrx = r"^[+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?$"
3639

3740
#Core Functions
3841
function connect(file::String)
@@ -92,7 +95,7 @@ function query(q::String,conn::SqliteDB=sqlitedb)
9295
#retrieve resultset
9396
while true
9497
for i = 1:ncols
95-
t = sqlite3_column_type(stmt,i-1)
98+
t = Sqlite.sqlite3_column_type(stmt,i-1)
9699
if t == SQLITE3_TEXT
97100
r = bytestring( sqlite3_column_text(stmt,i-1) )
98101
elseif t == SQLITE_FLOAT
@@ -129,15 +132,15 @@ function query(q::String,conn::SqliteDB=sqlitedb)
129132
sqlite3_finalize(stmt)
130133
return (conn.resultset = DataFrame(resultset,Index(colnames)))
131134
end
132-
function createtable(input::TableInput,conn::SqliteDB=sqlitedb;name::String="")
135+
function createtable(input::TableInput,conn::SqliteDB=sqlitedb;name::String="",delim::Char='\0',header::Bool=true,types::Array{DataType,1}=DataType[],infer::Bool=true)
133136
conn == null_SqliteDB && error("[sqlite]: A valid SqliteDB was not specified (and no valid default SqliteDB exists)")
134137
#these 2 calls are for performance
135138
internal_query(conn,"PRAGMA synchronous = OFF")
136139

137140
if typeof(input) == DataFrame
138141
r = df2table(input,conn,name)
139142
else
140-
r = 0 # dlm2table(input,conn,name)
143+
r = dlm2table(input,conn,name,delim,header,types,infer)
141144
end
142145
internal_query(conn,"PRAGMA synchronous = ON")
143146
return r
@@ -161,7 +164,6 @@ function df2table(df::DataFrame,conn::SqliteDB,name::String)
161164
#prepare insert table with parameters for column values
162165
params = chop(repeat("?,",ncols))
163166
stmt, r = internal_query(conn,"insert into $dfname values ($params)",false,false)
164-
sqlite3_reset(stmt)
165167
#bind, step, reset loop for inserting values
166168
for row = 1:nrow(df)
167169
for col = 1:ncols
@@ -191,13 +193,151 @@ function droptable(table::String,conn::SqliteDB=sqlitedb)
191193
return
192194
end
193195
#read raw file direct to sqlite table
194-
# function csv2table()
195-
196-
# end
197-
#read raw file to sqlite table (call csv2table), then run sql statement on table to return df (call to query)
198-
# function readcsvsql()
199-
200-
# end
196+
#Load a delimited text file into a newly created sqlite table.
#
# file   - path of the delimited file to read
# conn   - SqliteDB connection the table is created in
# name   - table name; when "" the first word that precedes a '.' in `file` is used
# delim  - field delimiter; when '\0' it is guessed from a csv/tsv/wsv file extension
# header - true when the first line of the file holds the column names;
#          otherwise the columns are named x1, x2, ...
# types  - column types, consulted only when `infer` is false; Integer subtypes map
#          to INT, FloatingPoint subtypes to REAL, everything else to TEXT
# infer  - when true, each column's type is guessed from its first non-empty value
#
#Every field is bound as text. Rows that have fewer fields than the header are
#padded with empty strings. Returns nothing.
#NOTE: the table name and the column names are interpolated into the SQL text as-is.
function dlm2table(file::String,conn::SqliteDB,name::String,delim::Char,header::Bool,types::Array{DataType,1},infer::Bool)
    #determine tablename and delimiter
    tablename = name
    if tablename == ""
        m = match(r"\w+(?=\.)",file)
        m == nothing && error("Unable to determine a table name from $file")
        tablename = m.match
    end
    delimiter = delim
    if delimiter == '\0'
        delimiter = ismatch(r"csv$", file) ? ',' : ismatch(r"tsv$", file) ? '\t' : ismatch(r"wsv$", file) ? ' ' : error("Unable to determine separator used in $file")
    end
    #get column names: the header is split exactly like the data rows so that a
    #quoted delimiter inside a column name does not shift the column count
    f = open(file)
    firstrow = split_quoted(chomp(readline(f)),delimiter)
    ncols = length(firstrow)
    if header
        colnames = firstrow
    else
        colnames = String["x$i" for i = 1:ncols]
        seekstart(f)
    end
    #append a sqlite type to each column name
    if infer
        check = falses(ncols)   #check[i] is true once column i has been typed
        for line in eachline(f)
            row = split_quoted(chomp(line),delimiter)
            for i = 1:min(ncols,length(row))
                #skip columns already typed and null/missing values
                (check[i] || row[i] == "") && continue
                #plain integer first, then anything stringy, then float, else string
                colnames[i] *= ismatch(INTrx,row[i]) ? " INT" : ismatch(STRINGrx,row[i]) ? " TEXT" : ismatch(FLOATrx,row[i]) ? " REAL" : " TEXT"
                check[i] = true
            end
            sum(check) == ncols && break
        end
        #columns that never showed a non-empty value fall back to TEXT
        for i = 1:ncols
            if !check[i]
                colnames[i] *= " TEXT"
            end
        end
        #rewind to the first data row for the insert pass
        seekstart(f)
        header && readline(f)
    elseif length(types) > 0
        if length(types) < ncols
            close(f)
            error("Expected $ncols column types for $file but only $(length(types)) were given")
        end
        for i = 1:ncols
            colnames[i] *= types[i] <: Integer ? " INT" : types[i] <: FloatingPoint ? " REAL" : " TEXT"
        end
    end
    columndefs = join(colnames,',')
    internal_query(conn,"create table $tablename ($columndefs)")
    internal_query(conn,"BEGIN TRANSACTION")
    #prepare insert table with parameters for column values
    params = chop(repeat("?,",ncols))
    stmt, rc = internal_query(conn,"insert into $tablename values ($params)",false,false)
    #bind, step, reset loop for inserting values
    for line in eachline(f)
        row = Sqlite.split_quoted(chomp(line),delimiter)
        for col = 1:ncols
            #short rows are padded so a stale binding from the previous row is never reused
            d = col <= length(row) ? row[col] : ""
            #sqlite expects the text size in bytes, not in characters
            Sqlite.sqlite3_bind_text(stmt,col,d,sizeof(d),C_NULL)
        end
        Sqlite.sqlite3_step(stmt)
        Sqlite.sqlite3_reset(stmt)
    end
    sqlite3_finalize(stmt)
    internal_query(conn,"COMMIT")
    close(f)
    return
end
279+
#read raw file to sqlite table (call dlm2table), then run sql statement on table to return df (call to query)
280+
#Read a delimited file into a sqlite table, then run `sql` and return what query() returns.
#
# input - path of the delimited file
# conn  - connection to load into; when it equals the null connection, a database is
#         opened at a fresh tempname() path and used for this call
# sql   - statement handed to query() once the table has been created
# name, delim, header, types, infer - forwarded unchanged to createtable
function readdlmsql(input::String,conn::SqliteDB=sqlitedb;sql::String="select * from file",name::String="file",delim::Char='\0',header::Bool=true,types::Array{DataType,1}=DataType[],infer::Bool=true)
    if conn == null_SqliteDB
        #no usable connection: open a database at a temporary path
        dbpath = tempname()
        dbhandle = Array(Ptr{Void},1)
        Sqlite.sqlite3_open(dbpath,dbhandle)
        conn = Sqlite.SqliteDB(dbpath,dbhandle[1],Sqlite.null_resultset)
    end
    #load the file into the table, then hand the statement to query()
    createtable(input,conn;name=name,delim=delim,header=header,types=types,infer=infer)
    return query(sql,conn)
end
290+
#Find the next occurrence of the character `c` in `s`, starting at index `i`, while
#skipping over double-quoted sections ("...").
#
#Returns the index of the match, or 0 when `c` does not occur outside quotes.
#When `c` is itself '"', the index of the quote that closes the section is returned.
#A quote that is never closed swallows the rest of the string, so 0 is returned.
#Raises an error when `i` is smaller than 1.
function search_quoted(s::String, c::Char, i::Integer)
    if i < 1 error(BoundsError) end
    i = nextind(s,i-1)
    while !done(s,i)
        d, j = next(s,i)
        if d == '"'
            #advance to the closing quote; stop cleanly if the string ends first
            while true
                done(s,j) && return 0
                i = j
                d, j = next(s,i)
                d == '"' && break
            end
        end
        if d == c
            return i
        end
        i = j
    end
    return 0
end
#Search from the beginning of `s`.
search_quoted(s::String, c::Char) = search_quoted(s,c,start(s))
316+
#Split `str` into pieces at every position that search_quoted reports for `splitter`.
#
# limit      - splitting stops once limit-1 pieces have been collected and the
#              remainder becomes the final piece; 0 places no bound on the count
# keep_empty - when false, empty pieces are dropped
#
#Pieces are plain slices of `str`; quote characters are not removed.
function split_quoted(str::String, splitter, limit::Integer, keep_empty::Bool)
    pieces = String[]
    from = start(str)       #start of the piece being collected
    lastidx = endof(str)
    hit = search_quoted(str,splitter,from)
    at, after = first(hit), last(hit)+1
    while 0 < at <= lastidx && length(pieces) != limit-1
        if from < after
            (keep_empty || from < at) && push!(pieces, str[from:at-1])
            from = after
        end
        #always resume the search beyond the current match
        after <= at && (after = nextind(str,at))
        hit = search_quoted(str,splitter,after)
        at, after = first(hit), last(hit)+1
    end
    #whatever follows the last separator is the final piece
    (keep_empty || !done(str,from)) && push!(pieces, str[from:end])
    return pieces
end
#Convenience methods: no limit (0) and keeping empty pieces are the defaults.
split_quoted(s::String, spl, n::Integer) = split_quoted(s, spl, n, true)
split_quoted(s::String, spl, keep::Bool) = split_quoted(s, spl, 0, keep)
split_quoted(s::String, spl) = split_quoted(s, spl, 0, true)
201341
end #sqlite module
202342

203343
function sqldf(q::String)

0 commit comments

Comments
 (0)