Skip to content

Commit 6272b0e

Browse files
committed
[SYSTEMDS-3814] Fix invalid rename of csv input to output files
This patch fixes a remaining invalid rename of persistently read input csv files to csv output files, which "deletes" the input file. So far we based this information on the PREAD variable name, but certain assignments lose this information. We now properly capture this information at createvar instructions, preserve it inside all matrices, frames, and tensors, and thus ensure robustness for all kinds of programs.
1 parent 5b71a03 commit 6272b0e

File tree

4 files changed

+182
-8
lines changed

4 files changed

+182
-8
lines changed

src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ public enum CacheStatus {
183183

184184
/** The name of HDFS file in which the data is backed up. */
185185
protected String _hdfsFileName = null; // file name and path
186+
protected boolean _isPRead = false; //persistent read, must not be deleted
186187

187188
/**
188189
* Flag that indicates whether or not hdfs file exists. It is used
@@ -285,6 +286,14 @@ public String getFileName() {
285286
return _hdfsFileName;
286287
}
287288

289+
/**
 * Indicates whether this object is flagged as a persistent read, i.e.,
 * whether its backing file is an input that must not be deleted.
 *
 * @return the current value of the persistent-read flag
 */
public boolean isPersistentRead() {
290+
return _isPRead;
291+
}
292+
293+
/**
 * Sets the persistent-read flag of this object.
 *
 * @param pread true if the backing file is a persistently read input
 *              that must not be deleted
 */
public void setPersistentRead(boolean pread) {
294+
_isPRead = pread;
295+
}
296+
288297
public long getUniqueID() {
289298
return _uniqueID;
290299
}

src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ private void processCreateVariableInstruction(ExecutionContext ec){
706706
case MATRIX: {
707707
String fname = createUniqueFilename();
708708
MatrixObject obj = new MatrixObject(getInput1().getValueType(), fname);
709-
setCacheableDataFields(obj);
709+
setCacheableDataFields(obj, getInput1().getName());
710710
obj.setUpdateType(_updateType);
711711
obj.setMarkForLinCache(true);
712712
ec.setVariable(getInput1().getName(), obj);
@@ -717,14 +717,14 @@ private void processCreateVariableInstruction(ExecutionContext ec){
717717
case TENSOR: {
718718
String fname = createUniqueFilename();
719719
TensorObject obj = new TensorObject(getInput1().getValueType(), fname);
720-
setCacheableDataFields(obj);
720+
setCacheableDataFields(obj, getInput1().getName());
721721
ec.setVariable(getInput1().getName(), obj);
722722
break;
723723
}
724724
case FRAME: {
725725
String fname = createUniqueFilename();
726726
FrameObject fobj = new FrameObject(fname);
727-
setCacheableDataFields(fobj);
727+
setCacheableDataFields(fobj, getInput1().getName());
728728
if( _schema != null )
729729
fobj.setSchema(_schema); //after metadata
730730
ec.setVariable(getInput1().getName(), fobj);
@@ -757,13 +757,14 @@ private String createUniqueFilename(){
757757
return fname;
758758
}
759759

760-
private void setCacheableDataFields(CacheableData<?> obj){
760+
private void setCacheableDataFields(CacheableData<?> obj, String varname){
761761
//clone metadata because it is updated on copy-on-write, otherwise there
762762
//is potential for hidden side effects between variables.
763763
obj.setMetaData((MetaData)metadata.clone());
764764
obj.enableCleanup(!getInput1().getName()
765765
.startsWith(org.apache.sysds.lops.Data.PREAD_PREFIX));
766766
obj.setFileFormatProperties(_formatProperties);
767+
obj.setPersistentRead(varname.startsWith(org.apache.sysds.lops.Data.PREAD_PREFIX));
767768
}
768769

769770
/**
@@ -960,7 +961,7 @@ private void processCastAsMatrixVariableInstruction(ExecutionContext ec) {
960961

961962
/**
962963
* Handler for CastAsFrameVariable instruction
963-
*
964+
*
964965
* @param ec execution context
965966
*/
966967
private void processCastAsFrameVariableInstruction(ExecutionContext ec){
@@ -1018,6 +1019,7 @@ private void processReadInstruction(ExecutionContext ec){
10181019
* @param ec execution context
10191020
*/
10201021
private void processCopyInstruction(ExecutionContext ec) {
1022+
10211023
// get source variable
10221024
Data dd = ec.getVariable(getInput1().getName());
10231025

@@ -1142,9 +1144,7 @@ private void writeCSVFile(ExecutionContext ec, String fname) {
11421144
try {
11431145
FileFormat fmt = ((MetaDataFormat)mo.getMetaData()).getFileFormat();
11441146
DataCharacteristics dc = (mo.getMetaData()).getDataCharacteristics();
1145-
if( fmt == FileFormat.CSV
1146-
&& !getInput1().getName().startsWith(org.apache.sysds.lops.Data.PREAD_PREFIX) )
1147-
{
1147+
if( fmt == FileFormat.CSV && !mo.isPersistentRead() ) {
11481148
WriterTextCSV writer = new WriterTextCSV((FileFormatPropertiesCSV)fprop);
11491149
writer.addHeaderToCSV(mo.getFileName(), fname, dc.getRows(), dc.getCols());
11501150
}
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.sysds.test.functions.io;
21+
22+
import java.io.File;
23+
import java.io.IOException;
24+
25+
import org.apache.commons.logging.Log;
26+
import org.apache.commons.logging.LogFactory;
27+
import org.apache.sysds.common.Types.ExecMode;
28+
import org.apache.sysds.common.Types.FileFormat;
29+
import org.apache.sysds.common.Types.ValueType;
30+
import org.apache.sysds.runtime.io.MatrixWriter;
31+
import org.apache.sysds.runtime.io.MatrixWriterFactory;
32+
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
33+
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
34+
import org.apache.sysds.runtime.util.DataConverter;
35+
import org.apache.sysds.runtime.util.HDFSTool;
36+
import org.apache.sysds.test.AutomatedTestBase;
37+
import org.apache.sysds.test.TestConfiguration;
38+
import org.apache.sysds.test.TestUtils;
39+
import org.junit.Assert;
40+
import org.junit.Test;
41+
42+
public class RenameIssueTest extends AutomatedTestBase {
43+
44+
protected static final Log LOG = LogFactory.getLog(RenameIssueTest.class.getName());
45+
46+
private final static String TEST_NAME1 = "Rename";
47+
private final static String TEST_DIR = "functions/io/";
48+
private final static String TEST_CLASS_DIR = TEST_DIR + RenameIssueTest.class.getSimpleName() + "/";
49+
50+
@Override
51+
public void setUp() {
52+
TestUtils.clearAssertionInformation();
53+
addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] {"L","R1"}) );
54+
}
55+
56+
@Test
57+
public void testCSVSinglenode() {
58+
runRameTest(FileFormat.CSV, ExecMode.SINGLE_NODE);
59+
}
60+
61+
@Test
62+
public void testCSVHybrid() {
63+
runRameTest(FileFormat.CSV, ExecMode.HYBRID);
64+
}
65+
66+
@Test
67+
public void testCSVSpark() {
68+
runRameTest(FileFormat.CSV, ExecMode.SPARK);
69+
}
70+
71+
@Test
72+
public void testTextSinglenode() {
73+
runRameTest(FileFormat.TEXT, ExecMode.SINGLE_NODE);
74+
}
75+
76+
@Test
77+
public void testTextHybrid() {
78+
runRameTest(FileFormat.TEXT, ExecMode.HYBRID);
79+
}
80+
81+
@Test
82+
public void testTextSpark() {
83+
runRameTest(FileFormat.TEXT, ExecMode.SPARK);
84+
}
85+
86+
@Test
87+
public void testBinarySinglenode() {
88+
runRameTest(FileFormat.BINARY, ExecMode.SINGLE_NODE);
89+
}
90+
91+
@Test
92+
public void testBinaryHybrid() {
93+
runRameTest(FileFormat.BINARY, ExecMode.HYBRID);
94+
}
95+
96+
@Test
97+
public void testBinarySpark() {
98+
runRameTest(FileFormat.BINARY, ExecMode.SPARK);
99+
}
100+
101+
private void runRameTest(FileFormat fmt, ExecMode mode)
102+
{
103+
ExecMode modeOld = setExecMode(mode);
104+
105+
try {
106+
TestConfiguration config = getTestConfiguration(TEST_NAME1);
107+
loadTestConfiguration(config);
108+
109+
MatrixBlock a = DataConverter.convertToMatrixBlock(getRandomMatrix(7, 7, -1, 1, 0.5, -1));
110+
MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(fmt);
111+
writer.writeMatrixToHDFS(a, input("A"),
112+
(long)a.getNumRows(), (long)a.getNumColumns(), (int)a.getNonZeros(), 1000);
113+
HDFSTool.writeMetaDataFile(input("A")+".mtd", ValueType.FP64,
114+
new MatrixCharacteristics(7,7,1000), fmt);
115+
116+
String HOME = SCRIPT_DIR + TEST_DIR;
117+
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
118+
programArgs = new String[]{"-explain",
119+
"-args", input("A"), fmt.toString().toLowerCase(), output("B")};
120+
runTest(true, false, null, -1);
121+
122+
//check file existence (no rename to output)
123+
Assert.assertTrue(new File(input("A")).exists());
124+
Assert.assertTrue(new File(output("B")).exists());
125+
}
126+
catch (IOException e) {
127+
e.printStackTrace();
128+
Assert.fail();
129+
}
130+
finally {
131+
resetExecMode(modeOld);
132+
}
133+
}
134+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#-------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
#-------------------------------------------------------------
21+
22+
# Read the input matrix from the path given as first script argument.
X1 = read($1);
23+
24+
# Plain assignment of the read input to a second variable name.
Xa = X1;
25+
for(i in 1:2) {
26+
# Write the current Xa to the output path ($3) in the requested format ($2).
write(Xa, $3, format=$2);
27+
# NOTE(review): the empty loop presumably separates the write from the
# following rbind so that the write is executed first - confirm.
while(FALSE){} #write first
28+
# Append the original input again, so later writes see a replicated dataset.
Xa = rbind(Xa, X1);
29+
print("Creating and writing replicated dataset ["+i+"]");
30+
}
31+

0 commit comments

Comments
 (0)