CatalystCode
diff --git a/‎.gitignore
Lines changed: 1 addition & 1 deletion b/‎.gitignore
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md
Lines changed: 25 additions & 17 deletions b/‎README.md
Lines changed: 25 additions & 17 deletions
diff --git a/‎app/api/tests/data/example.invalid.1.xlsx
9.61 KB b/‎app/api/tests/data/example.invalid.1.xlsx
9.61 KB
diff --git a/‎app/api/tests/data/example.invalid.2.xlsx
9.65 KB b/‎app/api/tests/data/example.invalid.2.xlsx
9.65 KB
diff --git a/‎app/api/tests/data/example.xlsx
9.61 KB b/‎app/api/tests/data/example.xlsx
9.61 KB
diff --git a/‎app/api/tests/data/example_one_column.csv
Lines changed: 4 additions & 0 deletions b/‎app/api/tests/data/example_one_column.csv
Lines changed: 4 additions & 0 deletions
diff --git a/‎app/api/tests/data/example_one_column.xlsx
9.58 KB b/‎app/api/tests/data/example_one_column.xlsx
9.58 KB
diff --git a/‎app/api/tests/data/example_one_column_no_header.xlsx
9.7 KB b/‎app/api/tests/data/example_one_column_no_header.xlsx
9.7 KB
diff --git a/‎app/api/tests/test_api.py
Lines changed: 44 additions & 1 deletion b/‎app/api/tests/test_api.py
Lines changed: 44 additions & 1 deletion
diff --git a/‎app/api/utils.py
Lines changed: 21 additions & 2 deletions b/‎app/api/utils.py
Lines changed: 21 additions & 2 deletions
@@ -199,4 +199,4 @@ node_modules/
 bundle/
 webpack-stats.json
 
-.vscode/
+.vscode
@@ -58,20 +58,19 @@ Doccano can be deployed to AWS ([Cloudformation](https://docs.aws.amazon.com/AWS
 
 > Notice: (1) EC2 KeyPair cannot be created automatically, so make sure you have an existing EC2 KeyPair in one region. Or [create one yourself](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair). (2) If you want to access doccano via HTTPS in AWS, here is an [instruction](https://github.com/chakki-works/doccano/wiki/HTTPS-setting-for-doccano-in-AWS).
 
-
 ## Features
 
-* Collaborative annotation
-* Multi-Language support
-* Emoji :smile: support
-* (future) Auto labeling
+-   Collaborative annotation
+-   Multi-Language support
+-   Emoji :smile: support
+-   (future) Auto labeling
 
 ## Requirements
 
-* Python 3.6+
-* Django 2.1.7+
-* Node.js 8.0+
-* Google Chrome(highly recommended)
+-   Python 3.6+
+-   Django 2.1.7+
+-   Node.js 8.0+
+-   Google Chrome(highly recommended)
 
 ## Installation
 
@@ -162,7 +161,9 @@ Finally, to start the server, run the following command:
 ```bash
 python manage.py runserver
 ```
+
 Optionally, you can change the bind ip and port using the command
+
 ```bash
 python manage.py runserver <ip>:<port>
 ```
@@ -197,28 +198,34 @@ After creating a project, you will see the "Import Data" page, or click `Import
 
 <img src="./docs/upload.png" alt="Upload project" width=600>
 
-You can upload two types of files:
-- `CSV file`: file must contain a header with a `text` column or be one-column csv file.
-- `JSON file`: each line contains a JSON object with a `text` key. JSON format supports line breaks rendering.
+You can upload the following types of files (depending on project type):
+
+-   `Text file`: file must contain one sentence/document per line separated by new lines.
+-   `CSV file`: file must contain a header with `"text"` as the first column or be a one-column CSV file. If using labels, the second column must be the labels.
+-   `Excel file`: file must contain a header with `"text"` as the first column or be a one-column Excel file. If using labels, the second column must be the labels. Supports multiple sheets as long as the format is the same.
+-   `JSON file`: each line contains a JSON object with a `text` key. JSON format supports line breaks rendering.
 
 > Notice: Doccano won't render line breaks in annotation page for sequence labeling task due to the indent problem, but the exported JSON file still contains line breaks.
 
-`example.txt` (or `example.csv`)
-```python
+`example.txt/csv/xlsx`
+
+```txt
 EU rejects German call to boycott British lamb.
 President Obama is speaking at the White House.
 He lives in Newark, Ohio.
 ...
 ```
+
 `example.json`
+
 ```JSON
 {"text": "EU rejects German call to boycott British lamb."}
 {"text": "President Obama is speaking at the White House."}
 {"text": "He lives in Newark, Ohio."}
 ...
 ```
 
-Any other columns (for csv) or keys (for json) are preserved and will be exported in the `metadata` column or key as is.
+Any other columns (for csv/excel) or keys (for json) are preserved and will be exported in the `metadata` column or key as is.
 
 Once you select a TXT/JSON file on your computer, click `Upload dataset` button. After uploading the dataset file, we will see the `Dataset` page (or click `Dataset` button list in the left bar). This page displays all the documents we uploaded in one project.
 
@@ -228,7 +235,6 @@ Click `Labels` button in left bar to define your own labels. You should see the
 
 <img src="./docs/label_editor.png" alt="Edit label" width=600>
 
-
 ### Annotation
 
 Now, you are ready to annotate the texts. Just click the `Annotate Data` button in the navigation bar, and you can start annotating the documents you uploaded.
@@ -249,11 +255,14 @@ by adding `external_id` to the imported file. For example:
 
 Input file may look like this:
 `import.json`
+
 ```JSON
 {"text": "EU rejects German call to boycott British lamb.", "meta": {"external_id": 1}}
 ```
+
 and the exported file will look like this:
 `output.json`
+
 ```JSON
 {"doc_id": 2023, "text": "EU rejects German call to boycott British lamb.", "labels": ["news"], "username": "root", "meta": {"external_id": 1}}
 ```
@@ -270,7 +279,6 @@ As with any software, doccano is under continuous development. If you have reque
 
 Here are some tips that might be helpful: [How to Contribute to Doccano Project](https://github.com/chakki-works/doccano/wiki/How-to-Contribute-to-Doccano-Project)
 
-
 ## Contact
 
 For help and feedback, please feel free to contact [the author](https://github.com/Hironsan).
 
@@ -0,0 +1,4 @@
+text
+AAA
+BBB
+CCC
@@ -759,7 +759,7 @@ def setUp(self):
     def upload_test_helper(self, project_id, filename, file_format, expected_status, **kwargs):
         url = reverse(viewname='doc_uploader', args=[project_id])
 
-        with open(os.path.join(DATA_DIR, filename)) as f:
+        with open(os.path.join(DATA_DIR, filename), 'rb') as f:
             response = self.client.post(url, data={'file': f, 'format': file_format})
 
         self.assertEqual(response.status_code, expected_status)
@@ -803,6 +803,12 @@ def test_can_upload_seq2seq_csv(self):
                                 file_format='csv',
                                 expected_status=status.HTTP_201_CREATED)
 
+    def test_can_upload_single_column_csv(self):
+        self.upload_test_helper(project_id=self.seq2seq_project.id,
+                                filename='example_one_column.csv',
+                                file_format='csv',
+                                expected_status=status.HTTP_201_CREATED)
+
     def test_cannot_upload_csv_file_does_not_match_column_and_row(self):
         self.upload_test_helper(project_id=self.classification_project.id,
                                 filename='example.invalid.1.csv',
@@ -815,6 +821,43 @@ def test_cannot_upload_csv_file_has_too_many_columns(self):
                                 file_format='csv',
                                 expected_status=status.HTTP_400_BAD_REQUEST)
 
+    def test_can_upload_classification_excel(self):
+        self.upload_test_helper(project_id=self.classification_project.id,
+                                filename='example.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_201_CREATED)
+
+    def test_can_upload_seq2seq_excel(self):
+        self.upload_test_helper(project_id=self.seq2seq_project.id,
+                                filename='example.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_201_CREATED)
+
+    def test_can_upload_single_column_excel(self):
+        self.upload_test_helper(project_id=self.seq2seq_project.id,
+                                filename='example_one_column.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_201_CREATED)
+
+    def test_cannot_upload_excel_file_does_not_match_column_and_row(self):
+        self.upload_test_helper(project_id=self.classification_project.id,
+                                filename='example.invalid.1.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_400_BAD_REQUEST)
+
+    def test_cannot_upload_excel_file_has_too_many_columns(self):
+        self.upload_test_helper(project_id=self.classification_project.id,
+                                filename='example.invalid.2.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_400_BAD_REQUEST)
+
+    @override_settings(IMPORT_BATCH_SIZE=1)
+    def test_can_upload_small_batch_size(self):
+        self.upload_test_helper(project_id=self.seq2seq_project.id,
+                                filename='example_one_column_no_header.xlsx',
+                                file_format='excel',
+                                expected_status=status.HTTP_201_CREATED)
+
     def test_can_upload_classification_jsonl(self):
         self.upload_test_helper(project_id=self.classification_project.id,
                                 filename='classification.jsonl',
 
@@ -9,6 +9,7 @@
 import conllu
 from django.db import transaction
 from django.conf import settings
+import pyexcel
 from rest_framework.renderers import JSONRenderer
 from seqeval.metrics.sequence_labeling import get_entities
 
@@ -324,13 +325,32 @@ class CSVParser(FileParser):
     def parse(self, file):
         file = io.TextIOWrapper(file, encoding='utf-8')
         reader = csv.reader(file)
+        yield from ExcelParser.parse_excel_csv_reader(reader)
+
+
+class ExcelParser(FileParser):
+    def parse(self, file):
+        excel_book = pyexcel.iget_book(file_type="xlsx", file_content=file.read())
+        # Handle multiple sheets
+        for sheet_name in excel_book.sheet_names():
+            reader = excel_book[sheet_name].to_array()
+            yield from self.parse_excel_csv_reader(reader)
+
+    @staticmethod
+    def parse_excel_csv_reader(reader):
         columns = next(reader)
         data = []
+        if len(columns) == 1 and columns[0] != 'text':
+            data.append({'text': columns[0]})
         for i, row in enumerate(reader, start=2):
             if len(data) >= settings.IMPORT_BATCH_SIZE:
                 yield data
                 data = []
-            if len(row) == len(columns) and len(row) >= 2:
+            # Only text column
+            if len(row) == len(columns) and len(row) == 1:
+                data.append({'text': row[0]})
+            # Text, labels and metadata columns
+            elif len(row) == len(columns) and len(row) >= 2:
                 text, label = row[:2]
                 meta = json.dumps(dict(zip(columns[2:], row[2:])))
                 j = {'text': text, 'labels': [label], 'meta': meta}
@@ -352,7 +372,6 @@ def parse(self, file):
                 data = []
             try:
                 j = json.loads(line)
-                #j  = json.loads(line.decode('utf-8'))
                 j['meta'] = json.dumps(j.get('meta', {}))
                 data.append(j)
             except json.decoder.JSONDecodeError:
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +text
 +AAA
 +BBB
 +CCC