wip. recurrent encode, lottery.

RobAltena · RobAltena · commit fe1560e7500a · 2019-09-25T17:03:59.000+09:00
Signed-off-by: Robert Altena <Rob@Ra-ai.com>
diff --git a/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/CorpusIterator.java b/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/CorpusIterator.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/* *****************************************************************************
  * Copyright (c) 2015-2019 Skymind, Inc.
  *
  * This program and the accompanying materials are made available under the
@@ -50,9 +50,8 @@ public class CorpusIterator implements MultiDataSetIterator {
     private int currentMacroBatch = 0;
     private int dictSize;
     private int rowSize;
-    private MultiDataSetPreProcessor preProcessor;
 
-    public CorpusIterator(List<List<Double>> corpus, int batchSize, int batchesPerMacrobatch, int dictSize, int rowSize) {
+    CorpusIterator(List<List<Double>> corpus, int batchSize, int batchesPerMacrobatch, int dictSize, int rowSize) {
         this.corpus = corpus;
         this.batchSize = batchSize;
         this.batchesPerMacrobatch = batchesPerMacrobatch;
@@ -100,8 +99,8 @@ public MultiDataSet next(int num) {
                     Nd4j.ones(rowPred.size()));
             // prediction (output) and decode ARE one-hots though, I couldn't add an embedding layer on top of the decoder and I'm not sure
             // it's a good idea either
-            double predOneHot[][] = new double[dictSize][rowPred.size()];
-            double decodeOneHot[][] = new double[dictSize][rowPred.size()];
+            double[][] predOneHot = new double[dictSize][rowPred.size()];
+            double[][] decodeOneHot = new double[dictSize][rowPred.size()];
             decodeOneHot[2][0] = 1; // <go> token
             int predIdx = 0;
             for (Double pred : rowPred) {
@@ -149,24 +148,23 @@ public int batch() {
         return currentBatch;
     }
 
-    public int totalBatches() {
+    int totalBatches() {
         return totalBatches;
     }
 
-    public void setCurrentBatch(int currentBatch) {
+    void setCurrentBatch(int currentBatch) {
         this.currentBatch = currentBatch;
         currentMacroBatch = getMacroBatchByCurrentBatch();
     }
 
-    public boolean hasNextMacrobatch() {
+    boolean hasNextMacrobatch() {
         return getMacroBatchByCurrentBatch() < totalMacroBatches && currentMacroBatch < totalMacroBatches;
     }
 
-    public void nextMacroBatch() {
+    void nextMacroBatch() {
         ++currentMacroBatch;
     }
 
     public void setPreProcessor(MultiDataSetPreProcessor preProcessor) {
-        this.preProcessor = preProcessor;
     }
 }
diff --git a/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/CorpusProcessor.java b/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/CorpusProcessor.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/* *****************************************************************************
  * Copyright (c) 2015-2019 Skymind, Inc.
  *
  * This program and the accompanying materials are made available under the
@@ -21,19 +21,19 @@
 import java.util.*;
 
 public class CorpusProcessor {
-    public static final String SPECIALS = "!\"#$;%^:?*()[]{}<>«»,.–—=+…";
+    static final String SPECIALS = "!\"#$;%^:?*()[]{}<>«»,.–—=+…";
     private Set<String> dictSet = new HashSet<>();
     private Map<String, Double> freq = new HashMap<>();
     private Map<String, Double> dict = new HashMap<>();
     private boolean countFreq;
     private InputStream is;
     private int rowSize;
 
-    public CorpusProcessor(String filename, int rowSize, boolean countFreq) throws FileNotFoundException {
+    CorpusProcessor(String filename, int rowSize, boolean countFreq) throws FileNotFoundException {
         this(new FileInputStream(filename), rowSize, countFreq);
     }
 
-    public CorpusProcessor(InputStream is, int rowSize, boolean countFreq) {
+    CorpusProcessor(InputStream is, int rowSize, boolean countFreq) {
         this.is = is;
         this.rowSize = rowSize;
         this.countFreq = countFreq;
@@ -43,33 +43,33 @@ public void start() throws IOException {
         try (BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
             String line;
             String lastName = "";
-            String lastLine = "";
+            StringBuilder lastLine = new StringBuilder();
             while ((line = br.readLine()) != null) {
                 String[] lineSplit = line.toLowerCase().split(" \\+\\+\\+\\$\\+\\+\\+ ", 5);
                 if (lineSplit.length > 4) {
                     // join consecuitive lines from the same speaker
                     if (lineSplit[1].equals(lastName)) {
-                        if (!lastLine.isEmpty()) {
+                        if (lastLine.length() > 0) {
                             // if the previous line doesn't end with a special symbol, append a comma and the current line
                             if (!SPECIALS.contains(lastLine.substring(lastLine.length() - 1))) {
-                                lastLine += ",";
+                                lastLine.append(",");
                             }
-                            lastLine += " " + lineSplit[4];
+                            lastLine.append(" ").append(lineSplit[4]);
                         } else {
-                            lastLine = lineSplit[4];
+                            lastLine = new StringBuilder(lineSplit[4]);
                         }
                     } else {
-                        if (lastLine.isEmpty()) {
-                            lastLine = lineSplit[4];
+                        if (lastLine.length() == 0) {
+                            lastLine = new StringBuilder(lineSplit[4]);
                         } else {
-                            processLine(lastLine);
-                            lastLine = lineSplit[4];
+                            processLine(lastLine.toString());
+                            lastLine = new StringBuilder(lineSplit[4]);
                         }
                         lastName = lineSplit[1];
                     }
                 }
             }
-            processLine(lastLine);
+            processLine(lastLine.toString());
         }
     }
 
@@ -78,7 +78,7 @@ protected void processLine(String lastLine) {
     }
 
     // here we not only split the words but also store punctuation marks
-    protected void tokenizeLine(String lastLine, Collection<String> resultCollection, boolean addSpecials) {
+    void tokenizeLine(String lastLine, Collection<String> resultCollection, boolean addSpecials) {
         String[] words = lastLine.split("[ \t]");
         for (String word : words) {
             if (!word.isEmpty()) {
@@ -122,15 +122,11 @@ private void addWord(Collection<String> coll, String word) {
         }
     }
 
-    public Set<String> getDictSet() {
-        return dictSet;
-    }
-
-    public Map<String, Double> getFreq() {
+    Map<String, Double> getFreq() {
         return freq;
     }
 
-    public void setDict(Map<String, Double> dict) {
+    void setDict(Map<String, Double> dict) {
         this.dict = dict;
     }
 
@@ -142,7 +138,7 @@ public void setDict(Map<String, Double> dict) {
      *            sequence of words
      * @return list of indices.
      */
-    protected final List<Double> wordsToIndexes(final Iterable<String> words) {
+    final List<Double> wordsToIndexes(final Iterable<String> words) {
         int i = rowSize;
         final List<Double> wordIdxs = new LinkedList<>();
         for (final String word : words) {
diff --git a/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/EncoderDecoderLSTM.java b/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/encdec/EncoderDecoderLSTM.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/* *****************************************************************************
  * Copyright (c) 2015-2019 Skymind, Inc.
  *
  * This program and the accompanying materials are made available under the
@@ -168,8 +168,6 @@ public class EncoderDecoderLSTM {
      */
     private final Map<Double, String> revDict = new HashMap<>();
 
-    private final String CHARS = "-\\/_&" + CorpusProcessor.SPECIALS;
-
     /**
      * The contents of the corpus. This is a list of sentences (each word of the
      * sentence is denoted by a {@link java.lang.Double}).
@@ -189,7 +187,6 @@ public class EncoderDecoderLSTM {
                                                // dictionary) are replaced with <unk> token
     private static final int TBPTT_SIZE = 25;
     private static final double LEARNING_RATE = 1e-1;
-    private static final double RMS_DECAY = 0.95;
     private static final int ROW_SIZE = 40; // maximum line length in tokens
 
     /**
@@ -207,10 +204,10 @@ public class EncoderDecoderLSTM {
     private ComputationGraph net;
 
     public static void main(String[] args) throws IOException {
-        new EncoderDecoderLSTM().run(args);
+        new EncoderDecoderLSTM().run();
     }
 
-    private void run(String[] args) throws IOException {
+    private void run() throws IOException {
         Nd4j.getMemoryManager().setAutoGcWindow(GC_WINDOW);
 
         createDictionary();
@@ -227,7 +224,7 @@ private void run(String[] args) throws IOException {
                 if (input.toLowerCase().equals("d")) {
                     startDialog(scanner);
                 } else {
-                    offset = Integer.valueOf(input);
+                    offset = Integer.parseInt(input);
                     test();
                 }
             }
@@ -326,6 +323,7 @@ private void train(File networkFile, int offset) throws IOException {
         }
     }
 
+    @SuppressWarnings("InfiniteLoopStatement")
     private void startDialog(Scanner scanner) throws IOException {
         System.out.println("Dialog started.");
         while (true) {
@@ -385,10 +383,10 @@ private void test() {
     private void output(List<Double> rowIn, boolean printUnknowns) {
         net.rnnClearPreviousState();
         Collections.reverse(rowIn);
-        INDArray in = Nd4j.create(ArrayUtils.toPrimitive(rowIn.toArray(new Double[0])), new int[] { 1, 1, rowIn.size() });
+        INDArray in = Nd4j.create(ArrayUtils.toPrimitive(rowIn.toArray(new Double[0])), 1, 1, rowIn.size());
         double[] decodeArr = new double[dict.size()];
         decodeArr[2] = 1;
-        INDArray decode = Nd4j.create(decodeArr, new int[] { 1, dict.size(), 1 });
+        INDArray decode = Nd4j.create(decodeArr, 1, dict.size(), 1);
         net.feedForward(new INDArray[] { in, decode }, false, false);
         org.deeplearning4j.nn.layers.recurrent.LSTM decoder = (org.deeplearning4j.nn.layers.recurrent.LSTM) net
                 .getLayer("decoder");
@@ -419,19 +417,20 @@ private void output(List<Double> rowIn, boolean printUnknowns) {
             }
             double[] newDecodeArr = new double[dict.size()];
             newDecodeArr[idx] = 1;
-            decode = Nd4j.create(newDecodeArr, new int[] { 1, dict.size(), 1 });
+            decode = Nd4j.create(newDecodeArr, 1, dict.size(), 1);
         }
         System.out.println();
     }
 
-    private void createDictionary() throws IOException, FileNotFoundException {
+    private void createDictionary() throws IOException {
         double idx = 3.0;
         dict.put("<unk>", 0.0);
         revDict.put(0.0, "<unk>");
         dict.put("<eos>", 1.0);
         revDict.put(1.0, "<eos>");
         dict.put("<go>", 2.0);
         revDict.put(2.0, "<go>");
+        String CHARS = "-\\/_&" + CorpusProcessor.SPECIALS;
         for (char c : CHARS.toCharArray()) {
             if (!dict.containsKey(String.valueOf(c))) {
                 dict.put(String.valueOf(c), idx);
@@ -443,7 +442,6 @@ private void createDictionary() throws IOException, FileNotFoundException {
         CorpusProcessor corpusProcessor = new CorpusProcessor(toTempPath(CORPUS_FILENAME), ROW_SIZE, true);
         corpusProcessor.start();
         Map<String, Double> freqs = corpusProcessor.getFreq();
-        Set<String> dictSet = new TreeSet<>(); // the tokens order is preserved for TreeSet
         Map<Double, Set<String>> freqMap = new TreeMap<>(new Comparator<Double>() {
 
             @Override
@@ -452,15 +450,13 @@ public int compare(Double o1, Double o2) {
             }
         }); // tokens of the same frequency fall under the same key, the order is reversed so the most frequent tokens go first
         for (Entry<String, Double> entry : freqs.entrySet()) {
-            Set<String> set = freqMap.get(entry.getValue());
-            if (set == null) {
-                set = new TreeSet<>(); // tokens of the same frequency would be sorted alphabetically
-                freqMap.put(entry.getValue(), set);
-            }
+            Set<String> set = freqMap.computeIfAbsent(entry.getValue(), k -> new TreeSet<>());
+            // tokens of the same frequency would be sorted alphabetically
             set.add(entry.getKey());
         }
         int cnt = 0;
-        dictSet.addAll(dict.keySet());
+        // the tokens order is preserved for TreeSet
+        Set<String> dictSet = new TreeSet<>(dict.keySet());
         // get most frequent tokens and put them to dictSet
         for (Entry<Double, Set<String>> entry : freqMap.entrySet()) {
             for (String val : entry.getValue()) {
diff --git a/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/processlottery/BaseDataSetReader.java b/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/processlottery/BaseDataSetReader.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/* *****************************************************************************
  * Copyright (c) 2015-2019 Skymind, Inc.
  *
  * This program and the accompanying materials are made available under the
@@ -19,7 +19,7 @@
 import org.nd4j.linalg.dataset.DataSet;
 
 import java.io.Serializable;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Iterator;
@@ -32,13 +32,13 @@ public abstract class BaseDataSetReader implements Serializable {
 
     protected Iterator<String> iter;
     protected Path filePath;
-    protected int totalExamples;
-    protected int currentCursor;
+    private int totalExamples;
+    int currentCursor;
 
-    public void doInitialize(){
+    void doInitialize(){
         List<String> dataLines;
         try {
-            dataLines = Files.readAllLines(filePath, Charset.forName("UTF-8"));
+            dataLines = Files.readAllLines(filePath, StandardCharsets.UTF_8);
         } catch (Exception e) {
             throw new RuntimeException("loading data failed");
         }
@@ -62,11 +62,7 @@ public List<String> getLabels() {
     public void reset() {
         doInitialize();
     }
-    public int totalExamples() {
+    int totalExamples() {
         return totalExamples;
     }
-    public int cursor() {
-        return currentCursor;
-    }
-
 }
diff --git a/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/processlottery/LotteryCharacterSequenceDataSetReader.java b/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/processlottery/LotteryCharacterSequenceDataSetReader.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/* *****************************************************************************
  * Copyright (c) 2015-2019 Skymind, Inc.
  *
  * This program and the accompanying materials are made available under the
@@ -29,7 +29,7 @@
  */
 public class LotteryCharacterSequenceDataSetReader extends BaseDataSetReader {
 
-    public LotteryCharacterSequenceDataSetReader(File file) {
+    LotteryCharacterSequenceDataSetReader(File file) {
         filePath = file.toPath();
         doInitialize();
     }
@@ -39,9 +39,6 @@ public DataSet next(int num) {
         INDArray features = Nd4j.create(new int[]{num, 10, 16}, 'f');
         INDArray labels = Nd4j.create(new int[]{num, 10, 16}, 'f');
 
-
-        INDArray featuresMask = null;
-        INDArray labelsMask = null;
         for (int i =0; i < num && iter.hasNext(); i ++) {
             String featureStr = iter.next();
             currentCursor ++;
@@ -54,7 +51,7 @@ public DataSet next(int num) {
                 labels.putScalar(new int[]{i, label, j}, 1.0);
             }
         }
-        return new DataSet(features, labels, featuresMask, labelsMask);
+        return new DataSet(features, labels, null, null);
     }
 
 }

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-/*******************************************************************************`
	`1`	`+/* *****************************************************************************`
`2`	`2`	`* Copyright (c) 2015-2019 Skymind, Inc.`
`3`	`3`	`*`
`4`	`4`	`* This program and the accompanying materials are made available under the`
`@@ -29,7 +29,7 @@`
`29`	`29`	`*/`
`30`	`30`	`public class LotteryCharacterSequenceDataSetReader extends BaseDataSetReader {`
`31`	`31`
`32`		`- public LotteryCharacterSequenceDataSetReader(File file) {`
	`32`	`+ LotteryCharacterSequenceDataSetReader(File file) {`
`33`	`33`	`filePath = file.toPath();`
`34`	`34`	`doInitialize();`
`35`	`35`	`}`
`@@ -39,9 +39,6 @@ public DataSet next(int num) {`
`39`	`39`	`INDArray features = Nd4j.create(new int[]{num, 10, 16}, 'f');`
`40`	`40`	`INDArray labels = Nd4j.create(new int[]{num, 10, 16}, 'f');`
`41`	`41`
`42`		`-`
`43`		`- INDArray featuresMask = null;`
`44`		`- INDArray labelsMask = null;`
`45`	`42`	`for (int i =0; i < num && iter.hasNext(); i ++) {`
`46`	`43`	`String featureStr = iter.next();`
`47`	`44`	`currentCursor ++;`
`@@ -54,7 +51,7 @@ public DataSet next(int num) {`
`54`	`51`	`labels.putScalar(new int[]{i, label, j}, 1.0);`
`55`	`52`	`}`
`56`	`53`	`}`
`57`		`- return new DataSet(features, labels, featuresMask, labelsMask);`
	`54`	`+ return new DataSet(features, labels, null, null);`
`58`	`55`	`}`
`59`	`56`
`60`	`57`	`}`