adding minimal workflow

Jorge Ejarque · Jorge Ejarque · commit 266ca7cc8654 · 2022-03-17T12:43:47.000+01:00
diff --git a/minimal_workflow/wordcount/README b/minimal_workflow/wordcount/README
@@ -0,0 +1,28 @@
+This is the Readme for:
+WordCount
+
+[Name]: Word Count
+[Contact Person]: support-compss@bsc.es
+[Access Level]: public
+[License Agreement]: Apache2
+[Platform]: COMPSs
+
+[Body]
+== Description ==
+Wordcount is an application that counts the number of occurrences of each word in a given set of files.
+To allow parallelism, every file is processed separately and the partial results are merged afterwards.
+
+Wordcount_blocks is an application that counts the number of occurrences of each word in a single file. It allows parallelism by splitting the file into blocks that are counted separately.
+
+== Execution instructions ==
+Usage:
+runcompss src/wordcount.py <datasetPath>
+
+or 
+
+runcompss src/wordcount_blocks.py <datasetPath> <output_file_path> <block_size>
+
+where:
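+        * - datasetPath: Absolute path of the dataset to parse: a directory of text files for wordcount.py (e.g. /home/compss/tutorial_apps/python/wordcount/data/) or a single file for wordcount_blocks.py
+        * - output_file_path: Path of the file where wordcount_blocks.py writes the resulting dictionary
+        * - block_size: Approximate size of each block of words processed by one task in wordcount_blocks.py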
+
diff --git a/minimal_workflow/wordcount/spack.yaml b/minimal_workflow/wordcount/spack.yaml
@@ -0,0 +1,7 @@
+spack:  # Spack environment definition
+        specs:  # root packages requested for this environment
+                - compss
+        concretization: together  # resolve all specs jointly, as one consistent set
+        config:
+                 install_tree: /opt/software  # where the packages get installed
+        view: /opt/view  # directory exposing the installed packages
diff --git a/minimal_workflow/wordcount/src/wordcount.py b/minimal_workflow/wordcount/src/wordcount.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+#
+#  Copyright 2002-2019 Barcelona Supercomputing Center (www.bsc.es)
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import time
+
+from pycompss.api.task import task
+from pycompss.api.parameter import *
+
+
+@task(file_path=FILE_IN, returns=list)
+def read_file(file_path):
+    """ Read a file and return a list of words.
+    :param file_path: file's path
+    :return: list of words
+    """
+    data = []
+    with open(file_path, 'r') as f:
+        for line in f:
+            data += line.split()
+    return data
+
+
+@task(returns=dict)
+def wordCount(data):
+    """ Construct a frequency word dictorionary from a list of words.
+    :param data: a list of words
+    :return: a dictionary where key=word and value=#appearances
+    """
+    partialResult = {}
+    for entry in data:
+        if entry in partialResult:
+            partialResult[entry] += 1
+        else:
+            partialResult[entry] = 1
+    return partialResult
+
+
+@task(returns=dict, priority=True)
+def merge_two_dicts(dic1, dic2):
+    """ Update a dictionary with another dictionary.
+    :param dic1: first dictionary
+    :param dic2: second dictionary
+    :return: dic1+=dic2
+    """
+    for k in dic2:
+        if k in dic1:
+            dic1[k] += dic2[k]
+        else:
+            dic1[k] = dic2[k]
+    return dic1
+
+
+if __name__ == "__main__":
+    from pycompss.api.api import compss_wait_on
+
+    # Start counting time...
+    start_time = time.time()
+
+    # Get the dataset path
+    pathDataset = sys.argv[1]
+
+    # Construct a list with the file's paths from the dataset
+    paths = []
+    for fileName in os.listdir(pathDataset):
+        paths.append(os.path.join(pathDataset, fileName))
+
+    # Read file's content execute a wordcount on each of them
+    partialResult = []
+    for p in paths:
+        data = read_file(p)
+        partialResult.append(wordCount(data))
+
+    # Accumulate the partial results to get the final result.
+    result = {}
+    for partial in partialResult:
+        result = merge_two_dicts(result, partial)
+
+    # Wait for result
+    result = compss_wait_on(result)
+
+    elapsed_time = time.time() - start_time
+
+    # Print the results and elapsed time
+    print("Word appearances:")
+    from pprint import pprint
+    pprint(result)
+    print("Elapsed Time (s): " + str(elapsed_time))
+    print("Words: " + str(sum(result.values())))
diff --git a/minimal_workflow/wordcount/src/wordcount_blocks.py b/minimal_workflow/wordcount/src/wordcount_blocks.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#
+#  Copyright 2002-2019 Barcelona Supercomputing Center (www.bsc.es)
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# -*- coding: utf-8 -*-
+
+'''Wordcount Block divide'''
+import sys
+import os
+import pickle
+import time
+from pycompss.api.task import task
+from pycompss.api.parameter import *
+from pycompss.api.api import compss_wait_on
+
+
+def read_word(file_object):
+    for line in file_object:
+        for word in line.split():
+            yield word
+
+
+def read_word_by_word(fp, sizeBlock):
+    """Lazy function (generator) to read a file piece by piece in
+    chunks of size approx sizeBlock"""
+    data = open(fp)
+    block = []
+    for word in read_word(data):
+        block.append(word)
+        if sys.getsizeof(block) > sizeBlock:
+            yield block
+            block = []
+    if block:
+        yield block
+
+
+@task(returns=dict)
+def wordCount(data):
+    partialResult = {}
+    for entry in data:
+        if entry not in partialResult:
+            partialResult[entry] = 1
+        else:
+            partialResult[entry] += 1
+    return partialResult
+
+
+@task(dic1=INOUT)
+def merge_two_dicts(dic1, dic2):
+    for k in dic2:
+        if k in dic1:
+            dic1[k] += dic2[k]
+        else:
+            dic1[k] = dic2[k]
+
+if __name__ == "__main__":
+    pathFile = sys.argv[1]
+    resultFile = sys.argv[2]
+    sizeBlock = int(sys.argv[3])
+
+    start = time.time()
+    result = {}
+    for block in read_word_by_word(pathFile, sizeBlock):
+        presult = wordCount(block)
+        merge_two_dicts(result, presult)
+    result = compss_wait_on(result)
+
+    elapsed = time.time() - start
+    print("Elapsed Time: " + str(elapsed))
+
+    ff = open(resultFile, 'w')
+    ff.write(str(result), ff)
+    ff.close()