Skip to content

Commit 35e88f6

Browse files
committed
Merge branch 'release-1.3.2'
2 parents 7f85c65 + 7b3e59a commit 35e88f6

File tree

105 files changed

+1125
-1405
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

105 files changed

+1125
-1405
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ You can include the WInte.r framework via the following Maven dependency:
2727
<dependency>
2828
<groupId>de.uni_mannheim.informatik.dws</groupId>
2929
<artifactId>winter</artifactId>
30-
<version>1.3.1</version>
30+
<version>1.3.2</version>
3131
</dependency>
3232
```
3333

winter-extensions/winter-metanome/metanome_integration/pom.xml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
<groupId>de.uni_mannheim.informatik.dws.winter</groupId>
66
<artifactId>metanome_integration</artifactId>
7-
<version>1.0</version>
7+
<version>1.1</version>
88
<packaging>jar</packaging>
99

1010
<name>metanome_integration</name>
@@ -43,13 +43,13 @@
4343
<dependency>
4444
<groupId>junit</groupId>
4545
<artifactId>junit</artifactId>
46-
<version>3.8.1</version>
46+
<version>4.12</version>
4747
<scope>test</scope>
4848
</dependency>
4949
<dependency>
5050
<groupId>de.uni_mannheim.informatik.dws</groupId>
5151
<artifactId>winter</artifactId>
52-
<version>1.3</version>
52+
<version>1.3.2</version>
5353
</dependency>
5454
<dependency>
5555
<!-- use this option when building: -Xss10M -->
@@ -62,11 +62,6 @@
6262
<artifactId>algorithm_integration</artifactId>
6363
<version>1.1-SNAPSHOT</version>
6464
</dependency>
65-
<dependency>
66-
<groupId>de.metanome.algorithms.tane</groupId>
67-
<artifactId>TANE-approximate</artifactId>
68-
<version>1.0</version>
69-
</dependency>
7065
<dependency>
7166
<groupId>de.metanome.algorithms.hyfd</groupId>
7267
<artifactId>HyFD</artifactId>

winter-extensions/winter-metanome/metanome_integration/src/main/java/de/uni_mannheim/informatik/dws/winter/webtables/FunctionalDependencyDiscovery.java

Lines changed: 155 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,11 @@
1111
*/
1212
package de.uni_mannheim.informatik.dws.winter.webtables;
1313

14-
14+
import java.io.BufferedReader;
15+
import java.io.BufferedWriter;
1516
import java.io.File;
17+
import java.io.FileWriter;
18+
import java.io.InputStreamReader;
1619
import java.io.PrintStream;
1720
import java.util.Collection;
1821
import java.util.HashMap;
@@ -23,15 +26,18 @@
2326
import java.util.Map;
2427
import java.util.Set;
2528

29+
// import de.hpi.isg.pyro.algorithms.Pyro;
2630
import de.metanome.algorithm_integration.AlgorithmExecutionException;
2731
import de.metanome.algorithm_integration.ColumnIdentifier;
2832
import de.metanome.algorithm_integration.input.RelationalInputGenerator;
2933
import de.metanome.algorithm_integration.result_receiver.ColumnNameMismatchException;
3034
import de.metanome.algorithm_integration.result_receiver.CouldNotReceiveResultException;
3135
import de.metanome.algorithm_integration.result_receiver.FunctionalDependencyResultReceiver;
36+
// import de.metanome.algorithm_integration.result_receiver.UniqueColumnCombinationResultReceiver;
3237
import de.metanome.algorithm_integration.results.FunctionalDependency;
38+
// import de.metanome.algorithm_integration.results.UniqueColumnCombination;
3339
import de.metanome.algorithms.hyfd.HyFD;
34-
import de.metanome.algorithms.tane.TaneAlgorithm;
40+
// import de.metanome.algorithms.tane.TaneAlgorithm;
3541
import de.uni_mannheim.informatik.dws.winter.model.Pair;
3642
import de.uni_mannheim.informatik.dws.winter.utils.StringUtils;
3743
import de.uni_mannheim.informatik.dws.winter.utils.query.Q;
@@ -217,102 +223,149 @@ public static void calculcateFunctionalDependencies(Collection<Table> tables, Fi
217223

218224
}
219225

220-
public static void calculateApproximateFunctionalDependencies(Collection<Table> tables, File csvLocation, double errorThreshold) throws Exception {
221-
PrintStream tmp = new PrintStream(new File("TANE.out"));
222-
final PrintStream out = System.out;
223-
224-
try {
225-
// calculate functional dependencies
226-
CSVTableWriter csvWriter = new CSVTableWriter();
227-
for(Table t : tables) {
228-
out.println(String.format("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}",
229-
t.getTableId(),
230-
t.getPath(),
231-
StringUtils.join(Q.project(t.getColumns(), new TableColumn.ColumnHeaderProjection()), ",")));
232-
233-
File tableAsCsv = csvWriter.write(t, new File(csvLocation, t.getPath()));
234-
235-
System.setOut(tmp);
236-
237-
Map<Set<TableColumn>, Set<TableColumn>> fds = calculateApproximateFunctionalDependencies(t, tableAsCsv, errorThreshold);
238-
t.getSchema().setFunctionalDependencies(fds);
239-
Set<Set<TableColumn>> candidateKeys = listCandidateKeys(t);
240-
241-
242-
243-
if(candidateKeys.size()==0) {
244-
candidateKeys.add(new HashSet<>(t.getColumns()));
226+
public static File taneRoot = null;
227+
228+
public static void calculateApproximateFunctionalDependencies(Collection<Table> tables, File csvLocation, double errorThreshold) throws Exception {
229+
CSVTableWriter csvWriter = new CSVTableWriter();
230+
File taneDataLocation = new File(taneRoot, "original");
231+
File taneDescriptionLocation = new File(taneRoot, "descriptions");
232+
// File taneExec = new File(taneLocation, "bin/taneg3");
233+
// File tanePrepare = new File(taneLocation, "bin/select.perl");
234+
for(Table t : tables) {
235+
System.out.println(String.format("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}",
236+
t.getTableId(),
237+
t.getPath(),
238+
StringUtils.join(Q.project(t.getColumns(), new TableColumn.ColumnHeaderProjection()), ",")));
239+
240+
// write file
241+
// File tableAsCsv = csvWriter.write(t, new File(taneDataLocation, t.getPath()));
242+
File tableAsCsv = new File(taneDataLocation, t.getPath());
243+
BufferedWriter w = new BufferedWriter(new FileWriter(tableAsCsv));
244+
for(TableRow r : t.getRows()) {
245+
Object[] values = r.getValueArray();
246+
for(int i = 0; i < values.length; i++) {
247+
Object o = values[i];
248+
if(i>0) {
249+
w.write(",");
250+
}
251+
if(o!=null) {
252+
w.write(o.toString().replace(",", ""));
253+
}
245254
}
246-
t.getSchema().setCandidateKeys(candidateKeys);
255+
w.write("\n");
247256
}
248-
} catch(AlgorithmExecutionException e) {
249-
throw new Exception(e.getMessage());
250-
} finally {
251-
System.setOut(out);
252-
}
253-
254-
}
257+
w.close();
258+
259+
// write description
260+
String descriptionFileName = t.getPath() + ".dsc";
261+
File description = new File(taneDescriptionLocation, descriptionFileName);
262+
w = new BufferedWriter(new FileWriter(description));
263+
w.write("Umask = 007\n");
264+
w.write(String.format("DataIn = ../original/%s\n", tableAsCsv.getName()));
265+
w.write("RemoveDuplicates = OFF\nAttributesOut = $BASENAME.atr\nStandardOut = ../data/$BASENAME.dat\nSavnikFlachOut = ../data/$BASENAME.rel\nNOOFDUPLICATES=1\n");
266+
w.close();
267+
268+
// prepare dataset
269+
String cmd = "../bin/select.perl ../descriptions/" + descriptionFileName;
270+
System.out.println(String.format("%s$ %s", taneDataLocation.getAbsolutePath(), cmd));
271+
Process p = Runtime.getRuntime().exec(cmd, null, taneDataLocation);
272+
String line = null;
273+
BufferedReader r = null;
274+
r = new BufferedReader(new InputStreamReader(p.getInputStream()));
275+
while((line = r.readLine()) != null) {
276+
System.out.println(line);
277+
}
278+
r.close();
279+
r = new BufferedReader(new InputStreamReader(p.getErrorStream()));
280+
while((line = r.readLine()) != null) {
281+
System.out.println(line);
282+
}
283+
r.close();
284+
285+
// run tane
286+
String nameWithoutExtension = t.getPath().replaceAll("\\..{3,4}$", "");
287+
File dataLocation = new File(taneRoot, "data/" + nameWithoutExtension + ".dat");
288+
cmd = String.format("./bin/taneg3 11 %d %d %s %f", t.getRows().size(), t.getColumns().size(), dataLocation.getAbsolutePath(), errorThreshold);
289+
System.out.println(String.format("%s$ %s", taneRoot.getAbsolutePath(), cmd));
290+
p = Runtime.getRuntime().exec(cmd, null, taneRoot);
291+
292+
Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
293+
Set<Set<TableColumn>> keys = new HashSet<>();
294+
r = new BufferedReader(new InputStreamReader(p.getInputStream()));
295+
while((line = r.readLine()) != null) {
296+
System.out.println(line);
297+
// FD lines always start with a number or ->
298+
String[] values = line.split("\\s");
299+
boolean isFdLine = false;
300+
301+
if(line.startsWith("->")) {
302+
isFdLine = true;
303+
} else {
304+
try {
305+
Integer.parseInt(values[0]);
306+
isFdLine = true;
307+
} catch(NumberFormatException ex) { isFdLine = false; }
308+
}
255309

256-
public static Map<Set<TableColumn>, Set<TableColumn>> calculateApproximateFunctionalDependencies(final Table t, File tableAsCsv, double errorThreshold) throws Exception {
257-
TaneAlgorithm tane = new TaneAlgorithm();
258-
tane.setErrorThreshold(errorThreshold);
259-
260-
final Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
261-
262-
try {
263-
RelationalInputGenerator input = new WebTableFileInputGenerator(tableAsCsv);
264-
tane.setRelationalInputConfigurationValue(TaneAlgorithm.INPUT_TAG, input);
265-
tane.setResultReceiver(new FunctionalDependencyResultReceiver() {
266-
267-
@Override
268-
public void receiveResult(FunctionalDependency arg0)
269-
throws CouldNotReceiveResultException, ColumnNameMismatchException {
270-
271-
synchronized (this) {
272-
273-
310+
if(isFdLine) {
274311
Set<TableColumn> det = new HashSet<>();
275-
276-
// identify determinant
277-
for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {
278-
Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
279-
280-
det.add(t.getSchema().get(colIdx));
312+
TableColumn dep = null;
313+
314+
boolean depStart = false;
315+
for(int i = 0; i < values.length; i++) {
316+
if(depStart) {
317+
int idx = Integer.parseInt(values[i]) - 1;
318+
dep = t.getSchema().get(idx);
319+
break;
320+
} else {
321+
if("->".equals(values[i])) {
322+
depStart = true;
323+
} else {
324+
int idx = Integer.parseInt(values[i]) - 1;
325+
det.add(t.getSchema().get(idx));
326+
}
327+
}
281328
}
282329

283-
// add dependant
284-
Set<TableColumn> dep = null;
330+
Set<TableColumn> mergedDep = null;
285331
// check if we already have a dependency with the same determinant
286332
if(functionalDependencies.containsKey(det)) {
287333
// if so, we add the dependent to the existing dependency
288-
dep = functionalDependencies.get(det);
334+
mergedDep = functionalDependencies.get(det);
289335
}
290-
if(dep==null) {
336+
if(mergedDep==null) {
291337
// otherwise, we create a new dependency
292-
dep = new HashSet<>();
293-
functionalDependencies.put(det, dep);
338+
mergedDep = new HashSet<>();
339+
functionalDependencies.put(det, mergedDep);
294340
}
295-
Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
296-
dep.add(t.getSchema().get(colIdx));
341+
mergedDep.add(dep);
297342

343+
System.out.println(String.format("{%s}->{%s}",
344+
StringUtils.join(Q.project(det, (c)->c.getHeader()), ","),
345+
StringUtils.join(Q.project(mergedDep, (c)->c.getHeader()), ",")
346+
));
347+
348+
if(line.contains("key")) {
349+
keys.add(det);
298350
}
299351
}
300-
301-
@Override
302-
public Boolean acceptedResult(FunctionalDependency arg0) {
303-
return true;
304-
}
305-
});
352+
}
353+
r.close();
354+
r = new BufferedReader(new InputStreamReader(p.getErrorStream()));
355+
while((line = r.readLine()) != null) {
356+
System.out.println(line);
357+
}
358+
r.close();
306359

307-
tane.execute();
308-
} catch(AlgorithmExecutionException e) {
309-
throw new Exception(e.getMessage());
360+
t.getSchema().setFunctionalDependencies(functionalDependencies);
361+
t.getSchema().setCandidateKeys(keys);
310362
}
311-
312-
return functionalDependencies;
363+
}
364+
public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv) throws Exception {
365+
return calculateFunctionalDependencies(t, tableAsCsv, null);
313366
}
314367

315-
public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv) throws Exception {
368+
public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv, final Set<Pair<Set<TableColumn>, Set<TableColumn>>> fds) throws Exception {
316369
HyFD dep = new HyFD();
317370
dep.setBooleanConfigurationValue(HyFD.Identifier.VALIDATE_PARALLEL.name(), true);
318371
final Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
@@ -327,32 +380,32 @@ public void receiveResult(FunctionalDependency arg0)
327380
throws CouldNotReceiveResultException, ColumnNameMismatchException {
328381

329382
synchronized (this) {
383+
Set<TableColumn> det = new HashSet<>();
330384

385+
// identify determinant
386+
for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {
387+
Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
331388

332-
Set<TableColumn> det = new HashSet<>();
333-
334-
// identify determinant
335-
for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {
336-
Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
337-
338-
det.add(t.getSchema().get(colIdx));
339-
}
389+
det.add(t.getSchema().get(colIdx));
390+
}
340391

341-
// add dependant
342-
Set<TableColumn> dep = null;
343-
// check if we already have a dependency with the same determinant
344-
if(functionalDependencies.containsKey(det)) {
345-
// if so, we add the dependent to the existing dependency
346-
dep = functionalDependencies.get(det);
347-
}
348-
if(dep==null) {
349-
// otherwise, we create a new dependency
350-
dep = new HashSet<>();
351-
functionalDependencies.put(det, dep);
352-
}
353-
Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
354-
dep.add(t.getSchema().get(colIdx));
355-
392+
// add dependant
393+
Set<TableColumn> dep = null;
394+
// check if we already have a dependency with the same determinant
395+
if(functionalDependencies.containsKey(det)) {
396+
// if so, we add the dependent to the existing dependency
397+
dep = functionalDependencies.get(det);
398+
}
399+
if(dep==null) {
400+
// otherwise, we create a new dependency
401+
dep = new HashSet<>();
402+
functionalDependencies.put(det, dep);
403+
}
404+
Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
405+
dep.add(t.getSchema().get(colIdx));
406+
if(fds!=null) {
407+
fds.add(new Pair<>(det, dep));
408+
}
356409
}
357410
}
358411

0 commit comments

Comments
 (0)