1111 */
1212package de .uni_mannheim .informatik .dws .winter .webtables ;
1313
14-
14+ import java .io .BufferedReader ;
15+ import java .io .BufferedWriter ;
1516import java .io .File ;
17+ import java .io .FileWriter ;
18+ import java .io .InputStreamReader ;
1619import java .io .PrintStream ;
1720import java .util .Collection ;
1821import java .util .HashMap ;
2326import java .util .Map ;
2427import java .util .Set ;
2528
29+ // import de.hpi.isg.pyro.algorithms.Pyro;
2630import de .metanome .algorithm_integration .AlgorithmExecutionException ;
2731import de .metanome .algorithm_integration .ColumnIdentifier ;
2832import de .metanome .algorithm_integration .input .RelationalInputGenerator ;
2933import de .metanome .algorithm_integration .result_receiver .ColumnNameMismatchException ;
3034import de .metanome .algorithm_integration .result_receiver .CouldNotReceiveResultException ;
3135import de .metanome .algorithm_integration .result_receiver .FunctionalDependencyResultReceiver ;
36+ // import de.metanome.algorithm_integration.result_receiver.UniqueColumnCombinationResultReceiver;
3237import de .metanome .algorithm_integration .results .FunctionalDependency ;
38+ // import de.metanome.algorithm_integration.results.UniqueColumnCombination;
3339import de .metanome .algorithms .hyfd .HyFD ;
34- import de .metanome .algorithms .tane .TaneAlgorithm ;
40+ // import de.metanome.algorithms.tane.TaneAlgorithm;
3541import de .uni_mannheim .informatik .dws .winter .model .Pair ;
3642import de .uni_mannheim .informatik .dws .winter .utils .StringUtils ;
3743import de .uni_mannheim .informatik .dws .winter .utils .query .Q ;
@@ -217,102 +223,149 @@ public static void calculcateFunctionalDependencies(Collection<Table> tables, Fi
217223
218224 }
219225
220- public static void calculateApproximateFunctionalDependencies (Collection <Table > tables , File csvLocation , double errorThreshold ) throws Exception {
221- PrintStream tmp = new PrintStream (new File ("TANE.out" ));
222- final PrintStream out = System .out ;
223-
224- try {
225- // calculate functional dependencies
226- CSVTableWriter csvWriter = new CSVTableWriter ();
227- for (Table t : tables ) {
228- out .println (String .format ("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}" ,
229- t .getTableId (),
230- t .getPath (),
231- StringUtils .join (Q .project (t .getColumns (), new TableColumn .ColumnHeaderProjection ()), "," )));
232-
233- File tableAsCsv = csvWriter .write (t , new File (csvLocation , t .getPath ()));
234-
235- System .setOut (tmp );
236-
237- Map <Set <TableColumn >, Set <TableColumn >> fds = calculateApproximateFunctionalDependencies (t , tableAsCsv , errorThreshold );
238- t .getSchema ().setFunctionalDependencies (fds );
239- Set <Set <TableColumn >> candidateKeys = listCandidateKeys (t );
240-
241-
242-
243- if (candidateKeys .size ()==0 ) {
244- candidateKeys .add (new HashSet <>(t .getColumns ()));
226+ public static File taneRoot = null ;
227+
228+ public static void calculateApproximateFunctionalDependencies (Collection <Table > tables , File csvLocation , double errorThreshold ) throws Exception {
229+ CSVTableWriter csvWriter = new CSVTableWriter ();
230+ File taneDataLocation = new File (taneRoot , "original" );
231+ File taneDescriptionLocation = new File (taneRoot , "descriptions" );
232+ // File taneExec = new File(taneLocation, "bin/taneg3");
233+ // File tanePrepare = new File(taneLocation, "bin/select.perl");
234+ for (Table t : tables ) {
235+ System .out .println (String .format ("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}" ,
236+ t .getTableId (),
237+ t .getPath (),
238+ StringUtils .join (Q .project (t .getColumns (), new TableColumn .ColumnHeaderProjection ()), "," )));
239+
240+ // write file
241+ // File tableAsCsv = csvWriter.write(t, new File(taneDataLocation, t.getPath()));
242+ File tableAsCsv = new File (taneDataLocation , t .getPath ());
243+ BufferedWriter w = new BufferedWriter (new FileWriter (tableAsCsv ));
244+ for (TableRow r : t .getRows ()) {
245+ Object [] values = r .getValueArray ();
246+ for (int i = 0 ; i < values .length ; i ++) {
247+ Object o = values [i ];
248+ if (i >0 ) {
249+ w .write ("," );
250+ }
251+ if (o !=null ) {
252+ w .write (o .toString ().replace ("," , "" ));
253+ }
245254 }
246- t . getSchema (). setCandidateKeys ( candidateKeys );
255+ w . write ( " \n " );
247256 }
248- } catch (AlgorithmExecutionException e ) {
249- throw new Exception (e .getMessage ());
250- } finally {
251- System .setOut (out );
252- }
253-
254- }
257+ w .close ();
258+
259+ // write description
260+ String descriptionFileName = t .getPath () + ".dsc" ;
261+ File description = new File (taneDescriptionLocation , descriptionFileName );
262+ w = new BufferedWriter (new FileWriter (description ));
263+ w .write ("Umask = 007\n " );
264+ w .write (String .format ("DataIn = ../original/%s\n " , tableAsCsv .getName ()));
265+ w .write ("RemoveDuplicates = OFF\n AttributesOut = $BASENAME.atr\n StandardOut = ../data/$BASENAME.dat\n SavnikFlachOut = ../data/$BASENAME.rel\n NOOFDUPLICATES=1\n " );
266+ w .close ();
267+
268+ // prepare dataset
269+ String cmd = "../bin/select.perl ../descriptions/" + descriptionFileName ;
270+ System .out .println (String .format ("%s$ %s" , taneDataLocation .getAbsolutePath (), cmd ));
271+ Process p = Runtime .getRuntime ().exec (cmd , null , taneDataLocation );
272+ String line = null ;
273+ BufferedReader r = null ;
274+ r = new BufferedReader (new InputStreamReader (p .getInputStream ()));
275+ while ((line = r .readLine ()) != null ) {
276+ System .out .println (line );
277+ }
278+ r .close ();
279+ r = new BufferedReader (new InputStreamReader (p .getErrorStream ()));
280+ while ((line = r .readLine ()) != null ) {
281+ System .out .println (line );
282+ }
283+ r .close ();
284+
285+ // run tane
286+ String nameWithoutExtension = t .getPath ().replaceAll ("\\ ..{3,4}$" , "" );
287+ File dataLocation = new File (taneRoot , "data/" + nameWithoutExtension + ".dat" );
288+ cmd = String .format ("./bin/taneg3 11 %d %d %s %f" , t .getRows ().size (), t .getColumns ().size (), dataLocation .getAbsolutePath (), errorThreshold );
289+ System .out .println (String .format ("%s$ %s" , taneRoot .getAbsolutePath (), cmd ));
290+ p = Runtime .getRuntime ().exec (cmd , null , taneRoot );
291+
292+ Map <Set <TableColumn >, Set <TableColumn >> functionalDependencies = new HashMap <>();
293+ Set <Set <TableColumn >> keys = new HashSet <>();
294+ r = new BufferedReader (new InputStreamReader (p .getInputStream ()));
295+ while ((line = r .readLine ()) != null ) {
296+ System .out .println (line );
297+ // FDs lines always start with a number or ->
298+ String [] values = line .split ("\\ s" );
299+ boolean isFdLine = false ;
300+
301+ if (line .startsWith ("->" )) {
302+ isFdLine = true ;
303+ } else {
304+ try {
305+ Integer .parseInt (values [0 ]);
306+ isFdLine = true ;
307+ } catch (NumberFormatException ex ) { isFdLine = false ; }
308+ }
255309
256- public static Map <Set <TableColumn >, Set <TableColumn >> calculateApproximateFunctionalDependencies (final Table t , File tableAsCsv , double errorThreshold ) throws Exception {
257- TaneAlgorithm tane = new TaneAlgorithm ();
258- tane .setErrorThreshold (errorThreshold );
259-
260- final Map <Set <TableColumn >, Set <TableColumn >> functionalDependencies = new HashMap <>();
261-
262- try {
263- RelationalInputGenerator input = new WebTableFileInputGenerator (tableAsCsv );
264- tane .setRelationalInputConfigurationValue (TaneAlgorithm .INPUT_TAG , input );
265- tane .setResultReceiver (new FunctionalDependencyResultReceiver () {
266-
267- @ Override
268- public void receiveResult (FunctionalDependency arg0 )
269- throws CouldNotReceiveResultException , ColumnNameMismatchException {
270-
271- synchronized (this ) {
272-
273-
310+ if (isFdLine ) {
274311 Set <TableColumn > det = new HashSet <>();
275-
276- // identify determinant
277- for (ColumnIdentifier ci : arg0 .getDeterminant ().getColumnIdentifiers ()) {
278- Integer colIdx = Integer .parseInt (ci .getColumnIdentifier ());
279-
280- det .add (t .getSchema ().get (colIdx ));
312+ TableColumn dep = null ;
313+
314+ boolean depStart = false ;
315+ for (int i = 0 ; i < values .length ; i ++) {
316+ if (depStart ) {
317+ int idx = Integer .parseInt (values [i ]) - 1 ;
318+ dep = t .getSchema ().get (idx );
319+ break ;
320+ } else {
321+ if ("->" .equals (values [i ])) {
322+ depStart = true ;
323+ } else {
324+ int idx = Integer .parseInt (values [i ]) - 1 ;
325+ det .add (t .getSchema ().get (idx ));
326+ }
327+ }
281328 }
282329
283- // add dependant
284- Set <TableColumn > dep = null ;
330+ Set <TableColumn > mergedDep = null ;
285331 // check if we already have a dependency with the same determinant
286332 if (functionalDependencies .containsKey (det )) {
287333 // if so, we add the dependent to the existing dependency
288- dep = functionalDependencies .get (det );
334+ mergedDep = functionalDependencies .get (det );
289335 }
290- if (dep ==null ) {
336+ if (mergedDep ==null ) {
291337 // otherwise, we create a new dependency
292- dep = new HashSet <>();
293- functionalDependencies .put (det , dep );
338+ mergedDep = new HashSet <>();
339+ functionalDependencies .put (det , mergedDep );
294340 }
295- Integer colIdx = Integer .parseInt (arg0 .getDependant ().getColumnIdentifier ());
296- dep .add (t .getSchema ().get (colIdx ));
341+ mergedDep .add (dep );
297342
343+ System .out .println (String .format ("{%s}->{%s}" ,
344+ StringUtils .join (Q .project (det , (c )->c .getHeader ()), "," ),
345+ StringUtils .join (Q .project (mergedDep , (c )->c .getHeader ()), "," )
346+ ));
347+
348+ if (line .contains ("key" )) {
349+ keys .add (det );
298350 }
299351 }
300-
301- @ Override
302- public Boolean acceptedResult (FunctionalDependency arg0 ) {
303- return true ;
304- }
305- });
352+ }
353+ r .close ();
354+ r = new BufferedReader (new InputStreamReader (p .getErrorStream ()));
355+ while ((line = r .readLine ()) != null ) {
356+ System .out .println (line );
357+ }
358+ r .close ();
306359
307- tane .execute ();
308- } catch (AlgorithmExecutionException e ) {
309- throw new Exception (e .getMessage ());
360+ t .getSchema ().setFunctionalDependencies (functionalDependencies );
361+ t .getSchema ().setCandidateKeys (keys );
310362 }
311-
312- return functionalDependencies ;
363+ }
364+ public static Map <Set <TableColumn >, Set <TableColumn >> calculateFunctionalDependencies (final Table t , File tableAsCsv ) throws Exception {
365+ return calculateFunctionalDependencies (t , tableAsCsv , null );
313366 }
314367
315- public static Map <Set <TableColumn >, Set <TableColumn >> calculateFunctionalDependencies (final Table t , File tableAsCsv ) throws Exception {
368+ public static Map <Set <TableColumn >, Set <TableColumn >> calculateFunctionalDependencies (final Table t , File tableAsCsv , final Set < Pair < Set < TableColumn >, Set < TableColumn >>> fds ) throws Exception {
316369 HyFD dep = new HyFD ();
317370 dep .setBooleanConfigurationValue (HyFD .Identifier .VALIDATE_PARALLEL .name (), true );
318371 final Map <Set <TableColumn >, Set <TableColumn >> functionalDependencies = new HashMap <>();
@@ -327,32 +380,32 @@ public void receiveResult(FunctionalDependency arg0)
327380 throws CouldNotReceiveResultException , ColumnNameMismatchException {
328381
329382 synchronized (this ) {
383+ Set <TableColumn > det = new HashSet <>();
330384
385+ // identify determinant
386+ for (ColumnIdentifier ci : arg0 .getDeterminant ().getColumnIdentifiers ()) {
387+ Integer colIdx = Integer .parseInt (ci .getColumnIdentifier ());
331388
332- Set <TableColumn > det = new HashSet <>();
333-
334- // identify determinant
335- for (ColumnIdentifier ci : arg0 .getDeterminant ().getColumnIdentifiers ()) {
336- Integer colIdx = Integer .parseInt (ci .getColumnIdentifier ());
337-
338- det .add (t .getSchema ().get (colIdx ));
339- }
389+ det .add (t .getSchema ().get (colIdx ));
390+ }
340391
341- // add dependant
342- Set <TableColumn > dep = null ;
343- // check if we already have a dependency with the same determinant
344- if (functionalDependencies .containsKey (det )) {
345- // if so, we add the dependent to the existing dependency
346- dep = functionalDependencies .get (det );
347- }
348- if (dep ==null ) {
349- // otherwise, we create a new dependency
350- dep = new HashSet <>();
351- functionalDependencies .put (det , dep );
352- }
353- Integer colIdx = Integer .parseInt (arg0 .getDependant ().getColumnIdentifier ());
354- dep .add (t .getSchema ().get (colIdx ));
355-
392+ // add dependant
393+ Set <TableColumn > dep = null ;
394+ // check if we already have a dependency with the same determinant
395+ if (functionalDependencies .containsKey (det )) {
396+ // if so, we add the dependent to the existing dependency
397+ dep = functionalDependencies .get (det );
398+ }
399+ if (dep ==null ) {
400+ // otherwise, we create a new dependency
401+ dep = new HashSet <>();
402+ functionalDependencies .put (det , dep );
403+ }
404+ Integer colIdx = Integer .parseInt (arg0 .getDependant ().getColumnIdentifier ());
405+ dep .add (t .getSchema ().get (colIdx ));
406+ if (fds !=null ) {
407+ fds .add (new Pair <>(det , dep ));
408+ }
356409 }
357410 }
358411
0 commit comments