Skip to content

Commit aeeb77d

Browse files
committed
simple pipe reader for hdfs or other service
1 parent 2e74cf4 commit aeeb77d

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

python/paddle/v2/reader/decorator.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,3 +323,101 @@ def xreader():
323323
yield sample
324324

325325
return xreader
326+
327+
328+
def _buf2lines(buf, line_break="\n"):
    """Split buf into (complete_lines, trailing_partial_line).

    The part after the last line_break is returned separately so the
    caller can carry it over into the next buffered read.
    """
    # FIXME: line_break should be automatically configured.
    head, sep, tail = buf.rpartition(line_break)
    # When no separator is present, head is empty and everything is
    # a partial line — mirror split()'s [:-1] result of an empty list.
    complete = head.split(line_break) if sep else []
    return complete, tail
332+
333+
334+
def pipe_reader(left_cmd,
                parser,
                bufsize=8192,
                file_type="plain",
                cut_lines=True,
                line_break="\n"):
    """
    pipe_reader reads data streamed from a command: it takes the
    command's stdout through a pipe buffer and redirects it to the
    parser, then yields data in your desired format.

    You can use a standard linux command or call another program
    to read data, from HDFS, Ceph, URL, AWS S3 etc:

        cmd = "hadoop fs -cat /path/to/some/file"
        cmd = "cat sample_file.tar.gz"
        cmd = "curl http://someurl"
        cmd = "python print_s3_bucket.py"

    A sample parser:

        def sample_parser(lines):
            # parse each line as one sample data,
            # return a list of samples as batches.
            ret = []
            for l in lines:
                ret.append(l.split(" ")[1:5])
            return ret

    :param left_cmd: command to execute to get stdout from.
    :type left_cmd: string
    :param parser: parser function to parse lines of data.
                   if cut_lines is True, parser will receive list
                   of lines.
                   if cut_lines is False, parser will receive a
                   raw buffer each time.
                   parser should return a list of parsed values.
    :type parser: callable
    :param bufsize: the buffer size used for the stdout pipe.
    :type bufsize: int
    :param file_type: can be plain/gzip, stream buffer data type.
    :type file_type: string
    :param cut_lines: whether to pass lines instead of raw buffer
                      to the parser
    :type cut_lines: bool
    :param line_break: line break of the file, like '\\n' or '\\r'
    :type line_break: string

    :return: the reader generator.
    :rtype: callable
    """
    if not isinstance(left_cmd, str):
        raise TypeError("left_cmd must be a string")
    if not callable(parser):
        raise TypeError("parser must be a callable object")
    # Fail fast on a bad file_type before spawning the subprocess;
    # previously this was only detected after the first chunk was read.
    if file_type not in ("plain", "gzip"):
        raise TypeError("file_type %s is not allowed" % file_type)

    process = subprocess.Popen(
        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
    # TODO(typhoonzero): add a thread to read stderr

    # Always init a decompress object once up front; it is cheaper
    # than creating one inside the read loop.
    dec = zlib.decompressobj(
        32 + zlib.MAX_WBITS)  # offset 32 to skip the header

    def reader():
        # The pipe delivers bytes; keep the carry-over buffer and the
        # separator in bytes as well so str and bytes are never mixed
        # (no-op on Python 2, required on Python 3).
        sep = line_break if isinstance(line_break, bytes) \
            else line_break.encode()
        remained = b""
        while True:
            buff = process.stdout.read(bufsize)
            if not buff:
                break
            if file_type == "gzip":
                decomp_buff = dec.decompress(buff)
            else:  # file_type was validated above, so this is "plain"
                decomp_buff = buff

            if cut_lines:
                lines, remained = _buf2lines(
                    b"".join([remained, decomp_buff]), sep)
                for ret in parser(lines):
                    yield ret
            else:
                for ret in parser(decomp_buff):
                    yield ret
        # BUGFIX: flush a trailing partial line (stream not ending with
        # line_break) so the last record is not silently dropped.
        if cut_lines and remained:
            for ret in parser([remained]):
                yield ret
        process.stdout.close()
        process.wait()  # reap the child to avoid leaving a zombie

    return reader

0 commit comments

Comments
 (0)