
Commit fb16847

feat: Staging brick for attention window chunking (#34)
* add huggingface dependencies and re pip-compile
* first pass on chunk by attention window
* test for chunking function
* completed tests for chunk_by_attention_window
* change default buffer size to 2
* wrapper function for staging
* added docs for transformers
* fix wording and typos
* updated change log and bumped the version
* added docs on huggingface dependencies
* fix typo
* re pip-compile
1 parent ec5be8e commit fb16847

14 files changed: +373 −20 lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
-## 0.2.1-dev6
+## 0.2.1-dev7

+* Added staging brick for separating text into attention window size chunks for `transformers`.
 * Added staging brick for LabelBox.
 * Added ability to upload LabelStudio predictions
 * Added utility function for JSONL reading and writing

Makefile

Lines changed: 8 additions & 1 deletion
@@ -20,13 +20,18 @@ install-base: install-base-pip-packages install-nltk-models
 install: install-base-pip-packages install-dev install-detectron2 install-nltk-models install-test

 .PHONY: install-ci
-install-ci: install-base-pip-packages install-pdf install-test install-nltk-models
+install-ci: install-base-pip-packages install-pdf install-test install-nltk-models install-huggingface

 .PHONY: install-base-pip-packages
 install-base-pip-packages:
 	python3 -m pip install pip==${PIP_VERSION}
 	pip install -r requirements/base.txt

+.PHONY: install-huggingface
+install-huggingface:
+	python3 -m pip install pip==${PIP_VERSION}
+	pip install -r requirements/huggingface.txt
+
 .PHONY: install-pdf
 install-pdf:
 	python3 -m pip install pip==${PIP_VERSION}
@@ -60,6 +65,8 @@ install-build:
 .PHONY: pip-compile
 pip-compile:
 	pip-compile -o requirements/base.txt
+	# Extra requirements for huggingface staging functions
+	pip-compile --extra huggingface -o requirements/huggingface.txt
 	# Extra requirements for parsing PDF files
 	pip-compile --extra pdf -o requirements/pdf.txt
 	# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not

docs/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ pygments==2.13.0
     # via sphinx
 pyparsing==3.0.9
     # via packaging
-pytz==2022.2.1
+pytz==2022.4
     # via babel
 requests==2.28.1
     # via sphinx
@@ -58,5 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
-zipp==3.8.1
+zipp==3.9.0
     # via importlib-metadata

docs/source/bricks.rst

Lines changed: 85 additions & 0 deletions
@@ -338,6 +338,91 @@ Examples:
   isd = convert_to_isd(elements)


+``stage_for_transformers``
+--------------------------
+
+Prepares ``Text`` elements for processing in ``transformers`` pipelines
+by splitting the elements into chunks that fit into the model's attention window.
+
+Examples:
+
+.. code:: python
+
+  from transformers import AutoTokenizer, AutoModelForTokenClassification
+  from transformers import pipeline
+
+  from unstructured.documents.elements import NarrativeText
+  from unstructured.staging.huggingface import stage_for_transformers
+
+  model_name = "hf-internal-testing/tiny-bert-for-token-classification"
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+  nlp = pipeline("ner", model=model, tokenizer=tokenizer)
+
+  text = """From frost advisories this morning to a strong cold front expected later this week, the chance of fall showing up is real.
+
+  There's a refreshing crispness to the air, and it looks to get only more pronounced as the week goes on.
+
+  Frost advisories were in place this morning across portions of the Appalachians and coastal Maine as temperatures dropped into the 30s.
+
+  Temperatures this morning were in the 40s as far south as the Florida Panhandle.
+
+  And Maine even had a few reports of their first snow of the season Sunday. More cities could see their first snow later this week.
+
+  Yes, hello fall!
+
+  As temperatures moderate during the next few days, much of the east will stay right around seasonal norms, but the next blast of cold air will be strong and come with the potential for hazardous conditions.
+
+  "A more active fall weather pattern is expected to evolve by the end of this week and continuing into the weekend as a couple of cold fronts move across the central and eastern states," the Weather Prediction Center said.
+
+  The potent cold front will come in from Canada with a punch of chilly air, heavy rain and strong wind.
+
+  The Weather Prediction Center has a slight risk of excessive rainfall for much of the Northeast and New England on Thursday, including places like New York City, Buffalo and Burlington, so we will have to look out for flash flooding in these areas.
+
+  "More impactful weather continues to look likely with confidence growing that our region will experience the first real fall-like system with gusty to strong winds and a period of moderate to heavy rain along and ahead of a cold front passage," the National Weather Service office in Burlington wrote.
+
+  The potential for very heavy rain could accompany the front, bringing up to two inches of rain for much of the area, and isolated locations could see even more.
+
+  "Ensembles [forecast models] show median rainfall totals by Wednesday night around a half inch, with a potential for some spots to see around one inch, our first substantial rainfall in at least a couple of weeks," the weather service office in Grand Rapids noted, adding, "It may also get cold enough for some snow to mix in Thursday night to Friday morning, especially in the higher terrain north of Grand Rapids toward Cadillac."
+
+  There is also a chance for very strong winds to accompany the system.
+
+  The weather service is forecasting winds of 30-40 mph ahead of the cold front, which could cause some tree limbs to fall and sporadic power outages.
+
+  Behind the front, temperatures will fall.
+
+  "East Coast, with highs about 5-15 degrees below average to close out the workweek and going into next weekend, with highs only in the 40s and 50s from the Great Lakes to the Northeast on most days," the Weather Prediction Center explained.
+
+  By the weekend, a second cold front will drop down from Canada and bring a reinforcing shot of chilly air across the eastern half of the country."""
+
+  chunks = stage_for_transformers([NarrativeText(text=text)], tokenizer)
+
+  results = [nlp(chunk) for chunk in chunks]
+
+
+The following optional keyword arguments can be specified in
+``stage_for_transformers``:
+
+* ``buffer``: Indicates the number of tokens to leave as a buffer for the attention window. This is to account for special tokens like ``[CLS]`` that can appear at the beginning or end of an input sequence.
+* ``max_input_size``: The size of the attention window for the model. If not specified, the default is the ``model_max_length`` attribute on the tokenizer object.
+* ``split_function``: The function used to split the text into chunks to consider for adding to the attention window. Splits on spaces by default.
+* ``chunk_separator``: The string used to concatenate adjacent chunks when reconstructing the text. Uses spaces by default.
+
+If you need to operate on text directly instead of ``unstructured`` ``Text``
+objects, use the ``chunk_by_attention_window`` helper function. Simply modify
+the example above to include the following:
+
+.. code:: python
+
+  from unstructured.staging.huggingface import chunk_by_attention_window
+
+  chunks = chunk_by_attention_window(text, tokenizer)
+
+  results = [nlp(chunk) for chunk in chunks]
+
+
 ``stage_for_label_studio``
 --------------------------

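The optional keyword arguments documented in the new section above can be combined in a single call. The sketch below is illustrative rather than part of the commit; it reuses the ``tokenizer``, ``nlp``, and ``text`` variables from the example above, and the paragraph-based ``split_function`` is just one possible choice:

  from unstructured.documents.elements import NarrativeText
  from unstructured.staging.huggingface import stage_for_transformers

  chunks = stage_for_transformers(
      [NarrativeText(text=text)],
      tokenizer,
      buffer=10,                                 # reserve extra room for special tokens like [CLS]
      max_input_size=512,                        # cap the window instead of using tokenizer.model_max_length
      split_function=lambda t: t.split("\n\n"),  # consider paragraph-sized pieces instead of single words
      chunk_separator="\n\n",                    # rejoin pieces with blank lines when rebuilding each chunk
  )

  results = [nlp(chunk) for chunk in chunks]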

docs/source/installing.rst

Lines changed: 19 additions & 0 deletions
@@ -52,3 +52,22 @@ Also ensure that you have ``poppler`` installed on your system. On a Mac, you ca
 .. code:: console

   $ brew install poppler
+
+
+========================
+Huggingface Dependencies
+========================
+
+The ``transformers`` library requires the Rust compiler to be present on your system in
+order to properly ``pip`` install. If a Rust compiler is not available on your system,
+you can run the following command to install it:
+
+.. code:: console
+
+  $ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+
+Additionally, some tokenizers in the ``transformers`` library require the ``sentencepiece``
+library. This is not included as an ``unstructured`` dependency because it only applies
+to some tokenizers. See the
+`sentencepiece install instructions <https://github.com/google/sentencepiece#installation>`_ for
+information on how to install ``sentencepiece`` if your tokenizer requires it.
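For tokenizers that do need it, ``sentencepiece`` is published on PyPI and can usually be installed directly; the command below is a general example and not part of this commit:

  $ pip install sentencepiece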

requirements/build.txt

Lines changed: 6 additions & 2 deletions
@@ -20,6 +20,8 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
+importlib-metadata==5.0.0
+    # via sphinx
 jinja2==3.1.2
     # via sphinx
 markupsafe==2.1.1
@@ -38,10 +40,10 @@ snowballstemmer==2.2.0
     # via sphinx
 sphinx==5.2.3
     # via
-    #   -r build.in
+    #   -r requirements/build.in
     #   sphinx-rtd-theme
 sphinx-rtd-theme==1.0.0
-    # via -r build.in
+    # via -r requirements/build.in
 sphinxcontrib-applehelp==1.0.2
     # via sphinx
 sphinxcontrib-devhelp==1.0.2
@@ -56,3 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
+zipp==3.9.0
+    # via importlib-metadata

requirements/dev.txt

Lines changed: 17 additions & 3 deletions
@@ -4,6 +4,10 @@
 #
 # pip-compile requirements/dev.in
 #
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via notebook
 argon2-cffi-bindings==21.2.0
@@ -36,6 +40,10 @@ executing==1.0.0
     # via stack-data
 fastjsonschema==2.16.2
     # via nbformat
+importlib-metadata==5.0.0
+    # via nbconvert
+importlib-resources==5.10.0
+    # via jsonschema
 ipykernel==6.15.3
     # via
     #   ipywidgets
@@ -45,7 +53,7 @@ ipykernel==6.15.3
     #   qtconsole
 ipython==8.5.0
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   ipykernel
     #   ipywidgets
     #   jupyter-console
@@ -64,7 +72,7 @@ jinja2==3.1.2
 jsonschema==4.16.0
     # via nbformat
 jupyter==1.0.0
-    # via -r dev.in
+    # via -r requirements/dev.in
 jupyter-client==7.3.5
     # via
     #   ipykernel
@@ -133,7 +141,9 @@ pexpect==4.8.0
 pickleshare==0.7.5
     # via ipython
 pip-tools==6.9.0
-    # via -r dev.in
+    # via -r requirements/dev.in
+pkgutil-resolve-name==1.3.10
+    # via jsonschema
 prometheus-client==0.14.1
     # via notebook
 prompt-toolkit==3.0.31
@@ -220,6 +230,10 @@ wheel==0.37.1
     # via pip-tools
 widgetsnbextension==4.0.3
     # via ipywidgets
+zipp==3.9.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources

 # The following packages are considered to be unsafe in a requirements file:
 # pip

requirements/huggingface.txt

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#
+# This file is autogenerated by pip-compile with python 3.8
+# To update, run:
+#
+#    pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
+#
+certifi==2022.9.24
+    # via requests
+charset-normalizer==2.1.1
+    # via requests
+click==8.1.3
+    # via nltk
+filelock==3.8.0
+    # via
+    #   huggingface-hub
+    #   transformers
+huggingface-hub==0.10.1
+    # via transformers
+idna==3.4
+    # via requests
+joblib==1.2.0
+    # via nltk
+lxml==4.9.1
+    # via unstructured (setup.py)
+nltk==3.7
+    # via unstructured (setup.py)
+numpy==1.23.4
+    # via transformers
+packaging==21.3
+    # via
+    #   huggingface-hub
+    #   transformers
+pyparsing==3.0.9
+    # via packaging
+pyyaml==6.0
+    # via
+    #   huggingface-hub
+    #   transformers
+regex==2022.9.13
+    # via
+    #   nltk
+    #   transformers
+requests==2.28.1
+    # via
+    #   huggingface-hub
+    #   transformers
+tokenizers==0.13.1
+    # via transformers
+tqdm==4.64.1
+    # via
+    #   huggingface-hub
+    #   nltk
+    #   transformers
+transformers==4.23.1
+    # via unstructured (setup.py)
+typing-extensions==4.4.0
+    # via huggingface-hub
+urllib3==1.26.12
+    # via requests

requirements/pdf.txt

Lines changed: 12 additions & 1 deletion
@@ -24,8 +24,12 @@ cycler==0.11.0
     # via matplotlib
 effdet==0.3.0
     # via layoutparser
+filelock==3.8.0
+    # via huggingface-hub
 fonttools==4.37.4
     # via matplotlib
+huggingface-hub==0.10.1
+    # via timm
 idna==3.4
     # via requests
 iopath==0.1.10
@@ -58,6 +62,7 @@ opencv-python==4.6.0.66
     # via layoutparser
 packaging==21.3
     # via
+    #   huggingface-hub
     #   matplotlib
     #   pytesseract
 pandas==1.5.0
@@ -96,12 +101,16 @@ pytz==2022.4
     # via pandas
 pyyaml==6.0
     # via
+    #   huggingface-hub
     #   layoutparser
     #   omegaconf
+    #   timm
 regex==2022.9.13
     # via nltk
 requests==2.28.1
-    # via torchvision
+    # via
+    #   huggingface-hub
+    #   torchvision
 scipy==1.9.2
     # via layoutparser
 six==1.16.0
@@ -121,10 +130,12 @@ torchvision==0.13.1
     #   timm
 tqdm==4.64.1
     # via
+    #   huggingface-hub
     #   iopath
     #   nltk
 typing-extensions==4.4.0
     # via
+    #   huggingface-hub
     #   iopath
     #   torch
     #   torchvision
