Commit 2f6d917

Added dumper option that outputs to an Excel file
1 parent 094ec1f commit 2f6d917

File tree

3 files changed (+90, -5 lines changed)

README.md

Lines changed: 43 additions & 3 deletions
@@ -4,9 +4,17 @@ Python utility package for scraping information on SINTA (Science and Technology
 
 ## A. Documentation
 
-### A.1. Author Verification
+### A.1. Installation
 
-#### A.1.i. Authentication
+You can install `sintautils` using PIP as follows:
+
+```sh
+pip install sintautils
+```
+
+### A.2. Author Verification
+
+#### A.2.i. Authentication
 
 Author verification menu is a restricted menu of SINTA. You must be registered as a university administrator and obtain an admin credential in order to use this function. An author verification (AV) admin's credential consists of an email-based username and a password.
 
@@ -25,6 +33,38 @@ from sintautils import AV
 scraper = AV('admin@university.edu', 'password1234', autologin=True)
 ```
 
+#### A.2.ii. Basic Usage
+
+After importing the modules and initializing the `AV` class, you can start dumping the research information of a given author in SINTA using the `dump_author()` method. The following code dumps all research data pertaining to a SINTA author and saves the result to an Excel file named `sintautils_dump_author-1234.xlsx` under the current working directory. Each data category (IPR, book, Google Scholar publication, etc.) is represented by a separate Excel sheet.
+
+```python
+# Change "1234" to the respective author's SINTA ID.
+scraper.dump_author('1234')
+```
+
+You can customize which data types to scrape by specifying the `fields` parameter:
+
+```python
+# Possible values for the "fields" parameter:
+# book, garuda, gscholar, ipr, research, scopus, service, wos
+# Use an asterisk "*" (the default) in order to scrape all information.
+scraper.dump_author('1234', fields='book garuda wos')
+```
+
+Also, you can change the output format, save directory, and filename prefix as follows:
+
+```python
+# Possible values for the "out_format" parameter:
+# csv, json, json-pretty, xlsx
+scraper.dump_author('1234',
+    out_format='json-pretty',
+    out_folder='/path/to/save/directory',
+    out_prefix='filename_prefix-'
+)
+```
+
+If multiple fields are specified when using `out_format='csv'`, each data type will be saved as a separate CSV file under the same `out_folder` directory.
+
 ## B. To-Do
 
 ### B.1. New Features
@@ -34,7 +74,7 @@ scraper = AV('admin@university.edu', 'password1234', autologin=True)
 - [X] Add scraper for IPR and book of each author.
 - [X] Add garuda scraper per author.
 - [X] Add author info dumper.
-- [ ] Add author info dumper using `openpyxl` implementation that outputs to an Excel/spreadsheet workbook file.
+- [X] Add author info dumper using an `openpyxl` implementation that outputs to an Excel/spreadsheet workbook file.
 
 ### B.2. Bug Fixes
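The `fields` parameter documented above takes a space-separated string of data-type names, with `"*"` selecting everything. As a rough standalone sketch of how such a string can be validated (an illustration only — `parse_fields` and `ALLOWED_FIELDS` are hypothetical names, not part of `sintautils`):

```python
# Hypothetical sketch: validate a space-separated "fields" string against
# the allowed values listed in the README. Not the library's actual code.
ALLOWED_FIELDS = {'book', 'garuda', 'gscholar', 'ipr', 'research', 'scopus', 'service', 'wos'}

def parse_fields(fields: str) -> list:
    """Expand '*' to all fields; reject unknown field names."""
    if fields.strip() == '*':
        return sorted(ALLOWED_FIELDS)
    selected = fields.split()
    unknown = [f for f in selected if f not in ALLOWED_FIELDS]
    if unknown:
        raise ValueError(f'Unknown fields: {unknown}')
    return selected

print(parse_fields('book garuda wos'))  # ['book', 'garuda', 'wos']
```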

src/sintautils/core.py

Lines changed: 35 additions & 2 deletions
@@ -29,6 +29,7 @@
 import requests as rq
 import time
 
+from .exceptions import EmptyFieldException
 from .exceptions import InvalidLoginCredentialException
 from .exceptions import InvalidParameterException
 from .exceptions import NoLoginCredentialsException
@@ -197,6 +198,10 @@ def dump_author(
             - ["*"]
         """
 
+        # Validating the fields.
+        if fields.__len__() < 1:
+            raise EmptyFieldException('book, garuda, gscholar, ipr, research, scopus, service, wos')
+
         # Validating the output format.
         if type(out_format) is not str or out_format not in ['csv', 'json', 'xlsx']:
             raise InvalidParameterException('"out_format" must be one of "csv", "json", and "xlsx"')
@@ -244,8 +249,36 @@ def dump(dump_id):
                    json.dump(b, fo)
 
            elif out_format == 'xlsx':
-                # TODO: Work on the implementation of xlsx dumper using openpyxl.
-                pass
+                wb = Workbook()
+                for m in sorted(a.keys()):
+                    ws = wb.create_sheet(m, -1)
+
+                    # Obtaining the data list and validate the data length.
+                    b: list = a[m]
+                    if b.__len__() < 1:
+                        continue
+
+                    # Write the spreadsheet header.
+                    headers: list = list(b[0].keys())
+                    for i in range(len(headers)):
+                        n = headers[i]
+                        ws.cell(row=1, column=(i + 1), value=n)
+
+                        # Write the column's content.
+                        for j in range(len(b)):
+                            c: dict = b[j]
+                            # Offset the row number by two, because the first row is header.
+                            ws.cell(row=(j + 2), column=(i + 1), value=c[n])
+
+                # Remove sheets that do not represent data type.
+                if wb.sheetnames.__len__() > 0:
+                    for d in wb.sheetnames:
+                        if d not in a.keys():
+                            wb.remove(wb[d])
+
+                # Saving the spreadsheet.
+                save_file = str(out_folder) + os.sep + str(out_prefix) + str(dump_id) + '.xlsx'
+                wb.save(save_file)
 
        if type(author_id) is str:
            dump(dump_id=author_id)
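The new `xlsx` branch above writes, for each data category, a header row taken from the first record's keys and then the records themselves, offset by two rows because row 1 holds the header. That cell-layout arithmetic can be sketched in isolation, without the `openpyxl` dependency (`layout_cells` and the sample data below are hypothetical, not part of `sintautils`):

```python
# Standalone sketch of the row/column layout used by the xlsx dumper.
# Cell (1, i+1) holds the i-th header; record j lands on row j+2.
def layout_cells(records: list) -> dict:
    """Map (row, column) -> value, mirroring the dumper's cell arithmetic."""
    cells = {}
    if len(records) < 1:
        return cells
    headers = list(records[0].keys())
    for i in range(len(headers)):
        n = headers[i]
        cells[(1, i + 1)] = n
        for j in range(len(records)):
            cells[(j + 2, i + 1)] = records[j][n]
    return cells

data = [{'title': 'Paper A', 'year': 2020}, {'title': 'Paper B', 'year': 2021}]
cells = layout_cells(data)
print(cells[(1, 1)], cells[(3, 2)])  # title 2021
```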

src/sintautils/exceptions.py

Lines changed: 12 additions & 0 deletions
@@ -39,6 +39,18 @@ def __repr__(self):
     __str__ = __repr__
 
 
+class EmptyFieldException(SintaException):
+    """ Error raised when there is no field selection in the scraper function passed. """
+
+    def __init__(self, arg: str = ''):
+        self.arg = arg
+
+    def __repr__(self):
+        return f'You must specify at least one of the following fields: {self.arg}. Use "*" to select all fields.'
+
+    __str__ = __repr__
+
+
 class InvalidAuthorIDException(SintaException):
     """ Error raised when the user specifies an invalid (i.e., non-numerical) author ID. """
 
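As a quick check of the new exception's behavior, the snippet below reproduces its message with a bare `Exception` stand-in for `SintaException` (the real base class lives in `sintautils.exceptions`):

```python
# Minimal stand-in: SintaException is reduced to a bare Exception subclass
# here so the snippet runs without installing sintautils.
class SintaException(Exception):
    pass

class EmptyFieldException(SintaException):
    """ Error raised when there is no field selection in the scraper function passed. """

    def __init__(self, arg: str = ''):
        self.arg = arg

    def __repr__(self):
        return f'You must specify at least one of the following fields: {self.arg}. Use "*" to select all fields.'

    __str__ = __repr__

exc = EmptyFieldException('book, garuda, wos')
print(str(exc))
```

Aliasing `__str__ = __repr__` makes the human-readable message appear both when the exception is printed and when it is inspected interactively.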

0 commit comments
