From cbaf226b623fa5e3d9c7a71a22ed7962f3dc9995 Mon Sep 17 00:00:00 2001
From: AWS Gopher <aws-gopher@users.noreply.github.com>
Date: Wed, 20 Aug 2025 15:37:58 -0400
Subject: [PATCH 1/2] unstructured-sdk-go: prepare for v0.1.0-alpha.1 release

---
 test/testdata/170603762v7-841f6504.pdf.json | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 test/testdata/170603762v7-841f6504.pdf.json
diff --git a/test/testdata/170603762v7-841f6504.pdf.json b/test/testdata/170603762v7-841f6504.pdf.json
new file mode 100644
index 0000000..05fdef6
--- /dev/null
+++ b/test/testdata/170603762v7-841f6504.pdf.json
@@ -0,0 +1 @@
+[{"type": "UncategorizedText", "element_id": "9ca0534b589243748c57b92e91d747e7", "text": "", "metadata": {"category_depth": 0, "page_number": 1, "text_as_html": "<div class=\"Page\" data-page-number=\"1\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "e30a96d14ddb42c88fc960c9c02dcb7e", "text": "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.", "metadata": {"category_depth": 1, "page_number": 1, "parent_id": "9ca0534b589243748c57b92e91d747e7", "text_as_html": "<p class=\"NarrativeText\">Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "e7ce45ab86aa42558c5315cfca7501db", "text": "Attention Is All You Need", "metadata": {"category_depth": 1, "page_number": 1, "parent_id": "9ca0534b589243748c57b92e91d747e7", "text_as_html": "<h1 class=\"Title\">Attention Is All You Need</h1>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text": "", "metadata": {"category_depth": 1, "page_number": 1, "parent_id": "9ca0534b589243748c57b92e91d747e7", "text_as_html": "<section class=\"Section\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "338750d2778f41fc881c7068de0efbb1", 
"text": "", "metadata": {"category_depth": 2, "page_number": 1, "parent_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text_as_html": "<div class=\"Column\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "bcac086de79e46f4808da442678417fc", "text": "Ashish Vaswani*  Google Brain  avaswani@google.com", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "338750d2778f41fc881c7068de0efbb1", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Ashish Vaswani*</p><br  /><p class=\"Paragraph\">Google Brain</p><br  /><p class=\"Paragraph\">avaswani@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "3bd0d40b4c6842afa284df0dc79572f4", "text": "Llion Jones*  Google Research  llion@google.com", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "338750d2778f41fc881c7068de0efbb1", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Llion Jones*</p><br  /><p class=\"Paragraph\">Google Research</p><br  /><p class=\"Paragraph\">llion@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "64e98d3941ad48cc9664743e6abb54ee", "text": "", "metadata": {"category_depth": 2, "page_number": 1, "parent_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text_as_html": "<div class=\"Column\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "ed34350d9e1b4f98a7b19696030a4b47", "text": "Noam Shazeer*  Google Brain  noam@google.com", "metadata": 
{"category_depth": 3, "page_number": 1, "parent_id": "64e98d3941ad48cc9664743e6abb54ee", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Noam Shazeer*</p><br  /><p class=\"Paragraph\">Google Brain</p><br  /><p class=\"Paragraph\">noam@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "7ea85b0795554234a075670472e3ee11", "text": "Aidan N. Gomez† ‡  University of Toronto  aidan@cs.toronto.edu", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "64e98d3941ad48cc9664743e6abb54ee", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Aidan N. Gomez† ‡</p><br  /><p class=\"Paragraph\">University of Toronto</p><br  /><p class=\"Paragraph\">aidan@cs.toronto.edu</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "9901f8d367314eb39d7691d68c9a05ff", "text": "", "metadata": {"category_depth": 2, "page_number": 1, "parent_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text_as_html": "<div class=\"Column\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "d2699e558027400db36ab042f60bbdf7", "text": "Niki Parmar*  Google Research  nikip@google.com", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "9901f8d367314eb39d7691d68c9a05ff", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Niki Parmar*</p><br  /><p class=\"Paragraph\">Google Research</p><br  /><p class=\"Paragraph\">nikip@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, 
{"type": "NarrativeText", "element_id": "24f1a2695bf941509c49c3e21c2b1e69", "text": "Łukasz Kaiser*  Google Brain  lukaszkaiser@google.com", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "9901f8d367314eb39d7691d68c9a05ff", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Łukasz Kaiser*</p><br  /><p class=\"Paragraph\">Google Brain</p><br  /><p class=\"Paragraph\">lukaszkaiser@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "f044f88ecac144468693cff2f54b3ab2", "text": "", "metadata": {"category_depth": 2, "page_number": 1, "parent_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text_as_html": "<div class=\"Column\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "f992dcab763d4bf9ade82bbab3a3210f", "text": "Jakob Uszkoreit*  Google Research  usz@google.com", "metadata": {"category_depth": 3, "page_number": 1, "parent_id": "f044f88ecac144468693cff2f54b3ab2", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Jakob Uszkoreit*</p><br  /><p class=\"Paragraph\">Google Research</p><br  /><p class=\"Paragraph\">usz@google.com</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "12b9c1a0850041fd9aa77832d74dde2c", "text": "Illia Polosukhin* ‡  illia.polosukhin@gmail.com", "metadata": {"category_depth": 2, "page_number": 1, "parent_id": "65c58c3e6e094ee6b6ffe0b46d56e93c", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Illia Polosukhin* ‡</p><br  /><p class=\"Paragraph\">illia.polosukhin@gmail.com</p></p>", "languages": ["eng"], "filetype": 
"application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "d26211093c3143ec8c16144485077ae3", "text": "Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data. *Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. 
Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. †Work performed while at Google Brain.  ‡Work performed while at Google Research. 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.", "metadata": {"category_depth": 1, "page_number": 1, "parent_id": "9ca0534b589243748c57b92e91d747e7", "text_as_html": "<section class=\"Abstract\"><h2 class=\"Subtitle\">Abstract</h2><p class=\"NarrativeText\">The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.</p><p class=\"NarrativeText\">*Equal contribution. Listing order is random. 
Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research.</p><p class=\"NarrativeText\"><p class=\"Paragraph\">†Work performed while at Google Brain.</p><br  /><p class=\"Paragraph\">‡Work performed while at Google Research.</p></p><p class=\"NarrativeText\">31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.</p></section>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text": "", "metadata": {"category_depth": 0, "page_number": 2, "text_as_html": "<div class=\"Page\" data-page-number=\"2\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "a82105de051047659563eb5bd379e0c3", "text": "1 Introduction", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<h2 class=\"Heading\">1 Introduction</h2>", "languages": ["eng"], "filetype": "application/pdf", 
"partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "dc51c7d0daca4473871bb7532f937fa7", "text": "Recurrent neural networks, long short-term memory [ 13 ] and gated recurrent [ 7 ] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [ 35 , 2 , 5 ]. Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [ 38 , 24 , 15 ].", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Recurrent neural networks, long short-term memory [</p><cite class=\"Citation\">13</cite><p class=\"Paragraph\">] and gated recurrent [</p><cite class=\"Citation\">7</cite><p class=\"Paragraph\">] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [</p><cite class=\"Citation\">35</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">2</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">5</cite><p class=\"Paragraph\">]. 
Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [</p><cite class=\"Citation\">38</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">24</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">15</cite><p class=\"Paragraph\">].</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "5c8991c45772452c87cd3f5d51fcaa8b", "text": "Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through factorization tricks [ 21 ] and conditional computation [ 32 ], while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains.", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h</p><sub class=\"FootnoteReference\">t</sub><p class=\"Paragraph\">, as a function of the previous hidden state h</p><sub class=\"FootnoteReference\">t−1</sub><p class=\"Paragraph\">and the input for position t. 
This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through factorization tricks [</p><cite class=\"Citation\">21</cite><p class=\"Paragraph\">] and conditional computation [</p><cite class=\"Citation\">32</cite><p class=\"Paragraph\">], while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "c720e0a2b3154f84845503ef8447f3e3", "text": "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [ 2 , 19 ]. In all but a few cases [ 27 ], however, such attention mechanisms are used in conjunction with a recurrent network.", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [</p><cite class=\"Citation\">2</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">19</cite><p class=\"Paragraph\">]. 
In all but a few cases [</p><cite class=\"Citation\">27</cite><p class=\"Paragraph\">], however, such attention mechanisms are used in conjunction with a recurrent network.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "656dcbeaaf394d19bd68b2e3f689d1b9", "text": "In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\">In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying on an attention mechanism to draw global dependencies between input and output. 
The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "85cde2a2c5a8490280dd12513c517abc", "text": "2 Background", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<h2 class=\"Heading\">2 Background</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "94a006eb5d1f49cca8096b65ae185bb0", "text": "The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [ 16 ], ByteNet [ 18 ] and ConvS2S [ 9 ], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions [ 12 ]. 
In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [</p><cite class=\"Citation\">16</cite><p class=\"Paragraph\">], ByteNet [</p><cite class=\"Citation\">18</cite><p class=\"Paragraph\">] and ConvS2S [</p><cite class=\"Citation\">9</cite><p class=\"Paragraph\">], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions [</p><cite class=\"Citation\">12</cite><p class=\"Paragraph\">]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "d6dfc79fe85046fd9210e5fc051045b6", "text": "Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. 
Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [ 4 , 27 , 28 , 22 ].", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [</p><cite class=\"Citation\">4</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">27</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">28</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">22</cite><p class=\"Paragraph\">].</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "3d0ca8e0f9ef4039bd8e86b71117f633", "text": "End-to-end memory networks are based on a recurrent attention mechanism instead of sequence-aligned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [ 34 ].", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">End-to-end memory networks are based on a recurrent attention mechanism instead of sequence-aligned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [</p><cite class=\"Citation\">34</cite><p class=\"Paragraph\">].</p></p>", "languages": ["eng"], "filetype": "application/pdf", 
"partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "4fc50a9bc50f4de1b252330600cfb27c", "text": "To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [ 17 , 18 ] and [ 9 ].", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [</p><cite class=\"Citation\">17</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">18</cite><p class=\"Paragraph\">] and [</p><cite class=\"Citation\">9</cite><p class=\"Paragraph\">].</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "f3ca6575657d487d8255ce6ac32d822a", "text": "3 Model Architecture", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<h2 class=\"Heading\">3 Model Architecture</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "1b37f268a8d945ba9df119a660e15f3e", "text": "Most competitive neural sequence transduction 
models have an encoder-decoder structure [ 5 , 2 , 35 ]. Here, the encoder maps an input sequence of symbol representations (x 1 , ..., x n ) to a sequence of continuous representations z = (z 1 , ..., z n ). Given z, the decoder then generates an output sequence (y 1 , ..., y m ) of symbols one element at a time. At each step the model is auto-regressive [ 10 ], consuming the previously generated symbols as additional input when generating the next.", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Most competitive neural sequence transduction models have an encoder-decoder structure [</p><cite class=\"Citation\">5</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">2</cite><p class=\"Paragraph\">,</p><cite class=\"Citation\">35</cite><p class=\"Paragraph\">]. Here, the encoder maps an input sequence of symbol representations (x</p><sub class=\"FootnoteReference\">1</sub><p class=\"Paragraph\">, ..., x</p><sub class=\"FootnoteReference\">n</sub><p class=\"Paragraph\">) to a sequence of continuous representations z = (z</p><sub class=\"FootnoteReference\">1</sub><p class=\"Paragraph\">, ..., z</p><sub class=\"FootnoteReference\">n</sub><p class=\"Paragraph\">). Given z, the decoder then generates an output sequence (y</p><sub class=\"FootnoteReference\">1</sub><p class=\"Paragraph\">, ..., y</p><sub class=\"FootnoteReference\">m</sub><p class=\"Paragraph\">) of symbols one element at a time. 
At each step the model is auto-regressive [</p><cite class=\"Citation\">10</cite><p class=\"Paragraph\">], consuming the previously generated symbols as additional input when generating the next.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "33f86e0f52284b99aa2873f6ac3602f8", "text": "2", "metadata": {"category_depth": 1, "page_number": 2, "parent_id": "3ddf36fb84a84ffeb170d9a6af0d246a", "text_as_html": "<span class=\"PageNumber\">2</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "bc996e5783254c709ded16e46393e58d", "text": "", "metadata": {"category_depth": 0, "page_number": 3, "text_as_html": "<div class=\"Page\" data-page-number=\"3\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "a022fcd07d884e2aac3658fc924fdcff", "text": "graph TD\n    A[Output Probabilities] --> B[Softmax]\n    B --> C[Linear]\n    C --> D[Add & Norm]\n    D --> E[Feed Forward]\n    E --> F[Add & Norm]\n    F --> G[Multi-Head Attention]\n    G --> H[Add & Norm]\n    H --> I[Masked Multi-Head Attention]\n    I --> J[Positional Encoding]\n    J --> K[Output Embedding]\n    K --> L[Outputs shifted right]\n    \n    M[Inputs] --> N[Input Embedding]\n    N --> O[Positional Encoding]\n    O --> P[Add & Norm]\n    P --> Q[Multi-Head Attention]\n    Q --> R[Add & Norm]\n    R --> S[Feed Forward]", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<figure class=\"Figure\"><pre class=\"CodeBlock\">graph TD\n    A[Output Probabilities] --> B[Softmax]\n    B --> C[Linear]\n    C --> D[Add & 
Norm]\n    D --> E[Feed Forward]\n    E --> F[Add & Norm]\n    F --> G[Multi-Head Attention]\n    G --> H[Add & Norm]\n    H --> I[Masked Multi-Head Attention]\n    I --> J[Positional Encoding]\n    J --> K[Output Embedding]\n    K --> L[Outputs shifted right]\n    \n    M[Inputs] --> N[Input Embedding]\n    N --> O[Positional Encoding]\n    O --> P[Add & Norm]\n    P --> Q[Multi-Head Attention]\n    Q --> R[Add & Norm]\n    R --> S[Feed Forward]</pre></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "FigureCaption", "element_id": "1f1b01a94414456da59c6ecb733dd866", "text": "Figure 1: The Transformer - model architecture.", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<figcaption class=\"Caption\">Figure 1: The Transformer - model architecture.</figcaption>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "942656dba81644a6ba10f3d68c730be2", "text": "The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<p class=\"NarrativeText\">The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": 
"5ba34a9c0f47477daac7e518ad88e9d2", "text": "3.1 Encoder and Decoder Stacks", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<h2 class=\"Subtitle\">3.1 Encoder and Decoder Stacks</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "b1f41d8ff7834be891bf1c5c30d728f5", "text": "Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network. We employ a residual connection [1] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension d model = 512.", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network. We employ a residual connection [1] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. 
To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension d</p><sub class=\"FootnoteReference\">model</sub><p class=\"Paragraph\">= 512.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "e1257230509449bea38f3cdeb27a693f", "text": "Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i.", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<p class=\"NarrativeText\">Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. 
This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "eb6a7cfeaf024f5d83ecf2badeaa11fa", "text": "3.2 Attention", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<h2 class=\"Subtitle\">3.2 Attention</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "eba9100363d04cfdb99c9f05539a1f8c", "text": "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<p class=\"NarrativeText\">An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. 
The output is computed as a weighted sum</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "bdad21464b174e96a6fb99bfa47296cf", "text": "3", "metadata": {"category_depth": 1, "page_number": 3, "parent_id": "bc996e5783254c709ded16e46393e58d", "text_as_html": "<span class=\"PageNumber\">3</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "499eebfc8e5b4a0eb567280c7fa67524", "text": "", "metadata": {"category_depth": 0, "page_number": 4, "text_as_html": "<div class=\"Page\" data-page-number=\"4\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "87e39e215d3b4a60985eecc08044f90f", "text": "graph TD\n    A[MatMul] --> B[SoftMax]\n    B --> C[Mask opt.]\n    C --> D[Scale]\n    D --> E[MatMul]\n    E --> |Q| F((Q))\n    E --> |K| G((K))\n    E --> |V| H((V))\n    \n    subgraph Scaled Dot-Product Attention", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<figure class=\"Figure\"><pre class=\"CodeBlock\">graph TD\n    A[MatMul] --> B[SoftMax]\n    B --> C[Mask opt.]\n    C --> D[Scale]\n    D --> E[MatMul]\n    E --> |Q| F((Q))\n    E --> |K| G((K))\n    E --> |V| H((V))\n    \n    subgraph Scaled Dot-Product Attention</pre></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "46db3a657dbd48978345e37efed3c7bf", "text": "graph TD\n    A[Linear] --> B[Concat]\n    B --> C[Scaled Dot-Product Attention]\n    D[Linear] --> |V| C\n    
E[Linear] --> |K| C\n    F[Linear] --> |Q| C\n    C --> |h| B\n    \n    subgraph Multi-Head Attention", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<figure class=\"Figure\"><pre class=\"CodeBlock\">graph TD\n    A[Linear] --> B[Concat]\n    B --> C[Scaled Dot-Product Attention]\n    D[Linear] --> |V| C\n    E[Linear] --> |K| C\n    F[Linear] --> |Q| C\n    C --> |h| B\n    \n    subgraph Multi-Head Attention</pre></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "FigureCaption", "element_id": "78afbe928dc94e6aaa3dcfd1505697f7", "text": "Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel.", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<figcaption class=\"Caption\">Figure 2: (left) Scaled Dot-Product Attention. 
(right) Multi-Head Attention consists of several attention layers running in parallel.</figcaption>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "6b35cc887ab544ebb4790373afdd20d6", "text": "of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<p class=\"NarrativeText\">of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "7ba5df470806450e89157870d333734b", "text": "3.2.1 Scaled Dot-Product Attention", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<h3 class=\"Heading\">3.2.1 Scaled Dot-Product Attention</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "8e643c05a5a240428930df57d5bba4e2", "text": "We call our particular attention \"Scaled Dot-Product Attention\" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the values. In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V. 
We compute the matrix of outputs as:", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<p class=\"NarrativeText\">We call our particular attention \"Scaled Dot-Product Attention\" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the values.</p><p class=\"NarrativeText\">In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V. We compute the matrix of outputs as:</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Formula", "element_id": "fff6e3b5f44240fd861d77e2903a96bc", "text": "Attention(Q, K, V) = softmax(QK^T/√dk)V                (1)", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<math class=\"Equation\">Attention(Q, K, V) = softmax(QK^T/√dk)V                (1)</math>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "874160bd683746d3a1b86ed597c71c61", "text": "The two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of 1/√dk. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. 
While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code. While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients.4 To counteract this effect, we scale the dot products by 1/√dk.", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<p class=\"NarrativeText\">The two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of 1/√dk. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code.</p><p class=\"NarrativeText\">While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. 
We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients.4 To counteract this effect, we scale the dot products by 1/√dk.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "e3167300e5c144c1996beaadec0c504e", "text": "3.2.2 Multi-Head Attention", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<h3 class=\"Heading\">3.2.2 Multi-Head Attention</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "11a9e6a8c94146d8970166c71f03bae4", "text": "Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional 4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q·k = ∑dk i=1 qi ki, has mean 0 and variance dk.", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<p class=\"NarrativeText\">Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. 
On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional</p><div class=\"Footnote\">4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q·k = ∑dk i=1 qi ki, has mean 0 and variance dk.</div>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "6cd7568a30cc41a78ff5eddb33091579", "text": "4", "metadata": {"category_depth": 1, "page_number": 4, "parent_id": "499eebfc8e5b4a0eb567280c7fa67524", "text_as_html": "<span class=\"PageNumber\">4</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text": "", "metadata": {"category_depth": 0, "page_number": 5, "text_as_html": "<div class=\"Page\" data-page-number=\"5\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "d4d29d991be542e69b394801e73a30ce", "text": "output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2. Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">output values. 
These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2.</p><p class=\"NarrativeText\">Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Formula", "element_id": "819e533cd7134573b28cf05af8510b62", "text": "MultiHead(Q, K, V) = Concat(head₁,...,headₕ)W^O where headᵢ = Attention(QW^Q_i, KW^K_i, VW^V_i)", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<math class=\"Formula\">MultiHead(Q, K, V) = Concat(head₁,...,headₕ)W^Owhere headᵢ = Attention(QW^Q_i, KW^K_i, VW^V_i)</math>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "cbb9ae7ce1b14ef09e8c70773f7b3131", "text": "Where the projections are parameter matrices W^Q_i ∈ ℝ^{dmodel×dk}, W^K_i ∈ ℝ^{dmodel×dk}, W^V_i ∈ ℝ^{dmodel×dv}, and W^O ∈ ℝ^{hdv×dmodel}. In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">Where the projections are parameter matrices W^Q_i ∈ ℝ^{dmodel×dk}, W^K_i ∈ ℝ^{dmodel×dk}, W^V_i ∈ ℝ^{dmodel×dv}, and W^O ∈ ℝ^{hdv×dmodel}.</p><p class=\"NarrativeText\">In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. 
Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "b72804e3da0341988747d5058615176c", "text": "3.2.3 Applications of Attention in our Model", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<h3 class=\"Heading\">3.2.3 Applications of Attention in our Model</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "3153606a62784a4abf0ac1204cf98bac", "text": "The Transformer uses multi-head attention in three different ways:", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">The Transformer uses multi-head attention in three different ways:</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "e6e1a18582204ec599126b5395312906", "text": "In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [38, 2, 9]. The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. 
Each position in the encoder can attend to all positions in the previous layer of the encoder. Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to -∞) all values in the input of the softmax which correspond to illegal connections. See Figure 2.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<ul class=\"UnorderedList\"><li class=\"ListItem\">In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [38, 2, 9].</li><li class=\"ListItem\">The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.</li><li class=\"ListItem\">Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to -∞) all values in the input of the softmax which correspond to illegal connections. 
See Figure 2.</li></ul>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "30412616038d4dcc84949d715d980606", "text": "3.3 Position-wise Feed-Forward Networks", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<h3 class=\"Heading\">3.3 Position-wise Feed-Forward Networks</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "8f8acb6a36f74397b30b155edd1d877a", "text": "In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. 
This consists of two linear transformations with a ReLU activation in between.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Formula", "element_id": "fa650caff972495d96db716980c56c43", "text": "FFN(x) = max(0, xW₁ + b₁)W₂ + b₂ (2)", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<math class=\"Formula\">FFN(x) = max(0, xW₁ + b₁)W₂ + b₂ (2)</math>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "e6ad77e30d53417192abba2f66fd0461", "text": "While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality dff = 2048.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. 
The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality dff = 2048.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "9e5ed82128dc410a896865301c60da84", "text": "3.4 Embeddings and Softmax", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<h3 class=\"Heading\">3.4 Embeddings and Softmax</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "b0062534139e4709948bca11068f41bb", "text": "Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [30]. In the embedding layers, we multiply those weights by √dmodel.", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<p class=\"NarrativeText\">Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [30]. 
In the embedding layers, we multiply those weights by √dmodel.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "5da546bf9c8949f585f3a4eb7cf24734", "text": "5", "metadata": {"category_depth": 1, "page_number": 5, "parent_id": "9ca6dd3a795f46e5932bdc4cf5626f70", "text_as_html": "<span class=\"PageNumber\">5</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "ea09ddc8dd46433bb2efe51d40041e76", "text": "", "metadata": {"category_depth": 0, "page_number": 6, "text_as_html": "<div class=\"Page\" data-page-number=\"6\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Table", "element_id": "8ef36b9d9dd744d4a6a58fb77a9e18cb", "text": "Layer Type Complexity per Layer Sequential Operations Maximum Path Length Self-Attention O(n² · d) O(1) O(1) Recurrent O(n · d²) O(n) O(n) Convolutional O(k · n · d²) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<table class=\"Table\"><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n² · d)</td><td>O(1)</td><td>O(1)</td></tr><tr><td>Recurrent</td><td>O(n · d²)</td><td>O(n)</td><td>O(n)</td></tr><tr><td>Convolutional</td><td>O(k · n · d²)</td><td>O(1)</td><td>O(logk(n))</td></tr><tr><td>Self-Attention (restricted)</td><td>O(r · n · d)</td><td>O(1)</td><td>O(n/r)</td></tr></tbody></table>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": 
"vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "48768042fe814320aa9302672fae3fdd", "text": "3.5 Positional Encoding", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<h2 class=\"Subtitle\">3.5 Positional Encoding</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "5f7fb6ff2eb54d46bed60b9d1a78d578", "text": "Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [9]. In this work, we use sine and cosine functions of different frequencies:", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<p class=\"NarrativeText\">Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. 
There are many choices of positional encodings, learned and fixed [9].</p><p class=\"NarrativeText\">In this work, we use sine and cosine functions of different frequencies:</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Formula", "element_id": "b06b2abbfc8f4580a8c7edead825e085", "text": "PE(pos,2i) = sin(pos/100002i/dmodel)", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<math class=\"Formula\">PE(pos,2i) = sin(pos/100002i/dmodel)</math>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Formula", "element_id": "a1488ff9cefd4478a4a24c7274be8550", "text": "PE(pos,2i+1) = cos(pos/100002i/dmodel)", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<math class=\"Formula\">PE(pos,2i+1) = cos(pos/100002i/dmodel)</math>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "670cf2454cc540a79079a857b9955d58", "text": "where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, PEpos+k can be represented as a linear function of PEpos. We also experimented with using learned positional embeddings [9] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). 
We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<p class=\"NarrativeText\">where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, PEpos+k can be represented as a linear function of PEpos.</p><p class=\"NarrativeText\">We also experimented with using learned positional embeddings [9] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "6276823eb73a4256acab011891630e24", "text": "4 Why Self-Attention", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<h1 class=\"Title\">4 Why Self-Attention</h1>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "68b3440895ae45d381d8f273c7423ced", "text": "In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi, zi ∈ Rd, such as a hidden layer 
in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata. One is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required. The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [12]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types. As noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<p class=\"NarrativeText\">In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi, zi ∈ Rd, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.</p><p class=\"NarrativeText\">One is the total computational complexity per layer. 
Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.</p><p class=\"NarrativeText\">The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [12]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.</p><p class=\"NarrativeText\">As noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "e2b6021db1734a29882b6f133cce295e", "text": "6", "metadata": {"category_depth": 1, "page_number": 6, "parent_id": "ea09ddc8dd46433bb2efe51d40041e76", "text_as_html": "<span class=\"PageNumber\">6</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text": "", "metadata": {"category_depth": 0, "page_number": 7, "text_as_html": "<div class=\"Page\" data-page-number=\"7\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": 
"170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "22ee075775194812b87f943937ac4bb4", "text": "length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [38] and byte-pair [31] representations. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work. A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [18], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k · n · d + n · d²). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model. As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. 
Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences.", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [38] and byte-pair [31] representations. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work.</p><p class=\"NarrativeText\">A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [18], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k · n · d + n · d²). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model.</p><p class=\"NarrativeText\">As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. 
Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "9b343ede981347b29cd63cfc4214adf7", "text": "5 Training", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<h1 class=\"Heading\">5 Training</h1>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "656b23ad8f784c89b448b211a449a605", "text": "This section describes the training regime for our models.", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">This section describes the training regime for our models.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "f1669eb317a341b381f166489633778d", "text": "5.1 Training Data and Batching", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<h2 class=\"Heading\">5.1 Training Data and Batching</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "042ec3156863477bb25343244a3cfd98", "text": "We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source-target vocabulary of about 37000 tokens. 
For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [38]. Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source-target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [38]. Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "cc2f40f068f444bcb4ecd42905d65910", "text": "5.2 Hardware and Schedule", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<h2 class=\"Heading\">5.2 Hardware and Schedule</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "418f239df2134793a4ebd7afba060289", "text": "We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. 
We trained the base models for a total of 100,000 steps or 12 hours. For our big models(described on the bottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days).", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models(described on the bottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days).</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "70b3922cf305405385e20bbc7fd0f469", "text": "5.3 Optimizer", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<h2 class=\"Heading\">5.3 Optimizer</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "ec1e1784b6ed4a48be517eb1bbb95566", "text": "We used the Adam optimizer [20] with β₁ = 0.9, β₂ = 0.98 and ϵ = 10⁻⁹. We varied the learning rate over the course of training, according to the formula: lrate = d⁻⁰·⁵ₘₒdₑₗ · min(step_num⁻⁰·⁵, step_num · warmup_steps⁻¹·⁵) (3) This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. 
We used warmup_steps = 4000.", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">We used the Adam optimizer [20] with β₁ = 0.9, β₂ = 0.98 and ϵ = 10⁻⁹. We varied the learning rate over the course of training, according to the formula:</p><p class=\"NarrativeText\">lrate = d⁻⁰·⁵ₘₒdₑₗ · min(step_num⁻⁰·⁵, step_num · warmup_steps⁻¹·⁵) (3)</p><p class=\"NarrativeText\">This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps = 4000.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "de5efc3e376d43699e8a38b697f2d70d", "text": "5.4 Regularization", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<h2 class=\"Heading\">5.4 Regularization</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "1a31b9f39f4e4fcf8afa253f333f7ed2", "text": "We employ three types of regularization during training:", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<p class=\"NarrativeText\">We employ three types of regularization during training:</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "e6c90a4be1514baea8fccc47d19b155b", "text": "7", "metadata": {"category_depth": 1, "page_number": 7, "parent_id": "d01758cfaf9941d68ccb11e2d7cbe29d", "text_as_html": "<span class=\"PageNumber\">7</span>", 
"languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text": "", "metadata": {"category_depth": 0, "page_number": 8, "text_as_html": "<div class=\"Page\" data-page-number=\"8\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "5716ba17fdbb4e66b63b6c565a03681e", "text": "Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h2 class=\"Subtitle\">Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost.</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Table", "element_id": "7b5bd5c2e9474796a453c3b65d01e374", "text": "Model BLEU Training Cost (FLOPs) EN-DE EN-FR EN-DE EN-FR ByteNet [18] 23.75 Deep-Att + PosUnk [39]  39.2  1.0 · 10²⁰ GNMT + RL [38] 24.6 39.92 2.3 · 10¹⁹ 1.4 · 10²⁰ ConvS2S [9] 25.16 40.46 9.6 · 10¹⁸ 1.5 · 10²⁰ MoE [32] 26.03 40.56 2.0 · 10¹⁹ 1.2 · 10²⁰ Deep-Att + PosUnk Ensemble [39]  40.4  8.0 · 10²⁰ GNMT + RL Ensemble [38] 26.30 41.16 1.8 · 10²⁰ 1.1 · 10²¹ ConvS2S Ensemble [9] 26.36 41.29 7.7 · 10¹⁹ 1.2 · 10²¹ Transformer (base model) 27.3 38.1 3.3 · 10¹⁸ Transformer (big) 28.4 41.8  2.3 · 10¹⁹", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<table 
class=\"Table\"><thead><tr><th>Model</th><th colspan=\"2\">BLEU</th><th colspan=\"2\">Training Cost (FLOPs)</th></tr><tr><th></th><th>EN-DE</th><th>EN-FR</th><th>EN-DE</th><th>EN-FR</th></tr></thead><tbody><tr><td>ByteNet [18]</td><td>23.75</td><td></td><td></td><td></td></tr><tr><td>Deep-Att + PosUnk [39]</td><td></td><td>39.2</td><td></td><td>1.0 · 10²⁰</td></tr><tr><td>GNMT + RL [38]</td><td>24.6</td><td>39.92</td><td>2.3 · 10¹⁹</td><td>1.4 · 10²⁰</td></tr><tr><td>ConvS2S [9]</td><td>25.16</td><td>40.46</td><td>9.6 · 10¹⁸</td><td>1.5 · 10²⁰</td></tr><tr><td>MoE [32]</td><td>26.03</td><td>40.56</td><td>2.0 · 10¹⁹</td><td>1.2 · 10²⁰</td></tr><tr><td>Deep-Att + PosUnk Ensemble [39]</td><td></td><td>40.4</td><td></td><td>8.0 · 10²⁰</td></tr><tr><td>GNMT + RL Ensemble [38]</td><td>26.30</td><td>41.16</td><td>1.8 · 10²⁰</td><td>1.1 · 10²¹</td></tr><tr><td>ConvS2S Ensemble [9]</td><td>26.36</td><td>41.29</td><td>7.7 · 10¹⁹</td><td>1.2 · 10²¹</td></tr><tr><td>Transformer (base model)</td><td>27.3</td><td>38.1</td><td>3.3 · 10¹⁸</td><td></td></tr><tr><td>Transformer (big)</td><td>28.4</td><td>41.8</td><td></td><td>2.3 · 10¹⁹</td></tr></tbody></table>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "282a2d8f589d45f986e6c953986b934f", "text": "Residual Dropout", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h3 class=\"Heading\">Residual Dropout</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "8d3fbfb85e314e0b937035233872ce43", "text": "We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. 
In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of P drop = 0.1.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of P</p><sub class=\"FootnoteReference\">drop</sub><p class=\"Paragraph\">= 0.1.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "9e4a66b5c20f44889ea5081ca1b64237", "text": "Label Smoothing", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h3 class=\"Heading\">Label Smoothing</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "34576fc12a4f42beb5c0a503cfe85898", "text": "During training, we employed label smoothing of value ϵ ls = 0.1 [36]. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">During training, we employed label smoothing of value ϵ</p><sub class=\"FootnoteReference\">ls</sub><p class=\"Paragraph\">= 0.1 [36]. 
This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "467f014af4384959a54842554e386a82", "text": "6 Results", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h2 class=\"Heading\">6 Results</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "969703f95ee64df68897bfbe8b356e6b", "text": "6.1 Machine Translation", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h3 class=\"Heading\">6.1 Machine Translation</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "06e8dbd61ab343c1a7daecc6200fbedb", "text": "On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. 
Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\">On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "db9a5804d83a42a68490e10f5b78c25e", "text": "On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. The Transformer (big) model trained for English-to-French used dropout rate P drop = 0.1, instead of 0.3.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. 
The Transformer (big) model trained for English-to-French used dropout rate P</p><sub class=\"FootnoteReference\">drop</sub><p class=\"Paragraph\">= 0.1, instead of 0.3.</p></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "86b57dcab5cd48bfa83479f3faf1a351", "text": "For the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty α = 0.6 [38]. These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [38].", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\">For the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty α = 0.6 [38]. These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [38].</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "adc0c6623ae342e490eeaa793421b307", "text": "Table 2 summarizes our results and compares our translation quality and training costs to other model architectures from the literature. 
We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU. 5", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">Table 2 summarizes our results and compares our translation quality and training costs to other model architectures from the literature. We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU.</p><sub class=\"FootnoteReference\">5</sub></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "fd1b0e8c13914264845bc3e0f3aa7a75", "text": "6.2 Model Variations", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<h3 class=\"Heading\">6.2 Model Variations</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "e756fe9cb1db401c85685ba71bd133fb", "text": "To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<p class=\"NarrativeText\">To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the</p>", 
"languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "60090cf1f18e4c52bbeb7f1b9d8fc5ba", "text": "5 We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively.", "metadata": {"category_depth": 1, "page_number": 8, "parent_id": "ae0b1482c76e4bb7ad07bc7e78d205d5", "text_as_html": "<div class=\"Footnote\"><span class=\"UncategorizedText\">5</span><p class=\"Paragraph\">We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively.</p></div>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "eb70ea776c0c49af918a9e0cb15a7932", "text": "", "metadata": {"category_depth": 0, "page_number": 9, "text_as_html": "<div class=\"Page\" data-page-number=\"9\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "a8467b14ad2c42b6a5a80acfdf37ed08", "text": "Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base model. All metrics are on the English-to-German translation development set, newstest2013. Listed perplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to per-word perplexities.", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<h2 class=\"Subtitle\">Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base model. All metrics are on the English-to-German translation development set, newstest2013. 
Listed perplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to per-word perplexities.</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Table", "element_id": "6f9b98eddf314d6f90778fc659f5fa64", "text": "N d_model d_ff h d_k d_v P_drop ε_ls train steps PPL (dev) BLEU (dev) params ×10⁶ base 6 512 2048 8 64 64 0.1 0.1 100K 4.92 25.8 65 (A)    1 512 512    5.29 24.9 4 128 128    5.00 25.5 16 32 32    4.91 25.8 32 16 16    5.01 25.4 (B)      16    5.16 25.1 58 32    5.01 25.4 60 (C) 2         6.11 23.7 36 4         5.19 25.3 50 8         4.88 25.5 80 256   32 32    5.75 24.5 28 1024   128 128    4.66 26.0 168 1024       5.12 25.4 53 4096       4.75 26.2 90 (D)       0.0   5.77 24.6 0.2   4.95 25.5 0.0  4.67 25.3 0.2  5.47 25.7 (E) positional embedding instead of sinusoids 4.92 25.7 big 6 1024 4096 16   0.3  300K 4.33 26.4 213", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<table class=\"Table\"><thead><tr><th>N</th><th>d_model</th><th>d_ff</th><th>h</th><th>d_k</th><th>d_v</th><th>P_drop</th><th>ε_ls</th><th>train steps</th><th>PPL (dev)</th><th>BLEU (dev)</th><th>params 
×10⁶</th></tr></thead><tbody><tr><td>base</td><td>6</td><td>512</td><td>2048</td><td>8</td><td>64</td><td>64</td><td>0.1</td><td>0.1</td><td>100K</td><td>4.92</td><td>25.8</td><td>65</td></tr><tr><td>(A)</td><td></td><td></td><td></td><td>1</td><td>512</td><td>512</td><td></td><td></td><td></td><td>5.29</td><td>24.9</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td>4</td><td>128</td><td>128</td><td></td><td></td><td></td><td>5.00</td><td>25.5</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td>16</td><td>32</td><td>32</td><td></td><td></td><td></td><td>4.91</td><td>25.8</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td>32</td><td>16</td><td>16</td><td></td><td></td><td></td><td>5.01</td><td>25.4</td><td></td></tr><tr><td>(B)</td><td></td><td></td><td></td><td></td><td></td><td>16</td><td></td><td></td><td></td><td>5.16</td><td>25.1</td><td>58</td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td>32</td><td></td><td></td><td></td><td>5.01</td><td>25.4</td><td>60</td></tr><tr><td>(C)</td><td>2</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>6.11</td><td>23.7</td><td>36</td></tr><tr><td></td><td>4</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>5.19</td><td>25.3</td><td>50</td></tr><tr><td></td><td>8</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>4.88</td><td>25.5</td><td>80</td></tr><tr><td></td><td></td><td>256</td><td></td><td></td><td>32</td><td>32</td><td></td><td></td><td></td><td>5.75</td><td>24.5</td><td>28</td></tr><tr><td></td><td></td><td>1024</td><td></td><td></td><td>128</td><td>128</td><td></td><td></td><td></td><td>4.66</td><td>26.0</td><td>168</td></tr><tr><td></td><td></td><td></td><td>1024</td><td></td><td></td><td></td><td></td><td></td><td></td><td>5.12</td><td>25.4</td><td>53</td></tr><tr><td></td><td></td><td></td><td>4096</td><td></td><td></td><td></td><td></td><td></td><td></td><td>4.75</td><td>26.2<
/td><td>90</td></tr><tr><td>(D)</td><td></td><td></td><td></td><td></td><td></td><td></td><td>0.0</td><td></td><td></td><td>5.77</td><td>24.6</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td>0.2</td><td></td><td></td><td>4.95</td><td>25.5</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>0.0</td><td></td><td>4.67</td><td>25.3</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>0.2</td><td></td><td>5.47</td><td>25.7</td><td></td></tr><tr><td>(E)</td><td colspan=\"11\">positional embedding instead of sinusoids</td><td>4.92</td><td>25.7</td><td></td></tr><tr><td>big</td><td>6</td><td>1024</td><td>4096</td><td>16</td><td></td><td></td><td>0.3</td><td></td><td>300K</td><td>4.33</td><td>26.4</td><td>213</td></tr></tbody></table>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "9f45c802c1014868a4cc471962828f47", "text": "development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table 3. In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section 3.2.2. While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads. In Table 3 rows (B), we observe that reducing the attention key size d_k hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. 
In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical results to the base model.", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<p class=\"NarrativeText\">development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table 3.</p><p class=\"NarrativeText\">In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section 3.2.2. While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.</p><p class=\"NarrativeText\">In Table 3 rows (B), we observe that reducing the attention key size d_k hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. 
In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical results to the base model.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "ab69b98cb1334130b74d09871cb383e1", "text": "6.3 English Constituency Parsing", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<h3 class=\"Heading\">6.3 English Constituency Parsing</h3>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "e6de7dde9c64462d8edf610319d74b16", "text": "To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [37]. We trained a 4-layer transformer with d_model = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-confidence and BerkelyParser corpus from with approximately 17M sentences [37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting. We performed only a small number of experiments to select the dropout, both attention and residual (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters remained unchanged from the English-to-German base translation model. 
During inference, we", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<p class=\"NarrativeText\">To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [37].</p><p class=\"NarrativeText\">We trained a 4-layer transformer with d_model = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-confidence and BerkelyParser corpus from with approximately 17M sentences [37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting.</p><p class=\"NarrativeText\">We performed only a small number of experiments to select the dropout, both attention and residual (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters remained unchanged from the English-to-German base translation model. 
During inference, we</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "dd1e0144f7cd4029844f0bddb82f8d0f", "text": "9", "metadata": {"category_depth": 1, "page_number": 9, "parent_id": "eb70ea776c0c49af918a9e0cb15a7932", "text_as_html": "<span class=\"PageNumber\">9</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text": "", "metadata": {"category_depth": 0, "page_number": 10, "text_as_html": "<div class=\"Page\" data-page-number=\"10\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "FigureCaption", "element_id": "acb11dfab54e4d3f8a7df7b38dc20027", "text": "Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ)", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<figcaption class=\"Caption\">Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ)</figcaption>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Table", "element_id": "c1a2327b46634b6a9187b2e88ba84a09", "text": "Parser Training WSJ 23 F1 Vinyals & Kaiser et al. (2014) [37] WSJ only, discriminative 88.3 Petrov et al. (2006) [29] WSJ only, discriminative 90.4 Zhu et al. (2013) [40] WSJ only, discriminative 90.4 Dyer et al. (2016) [8] WSJ only, discriminative 91.7 Transformer (4 layers) WSJ only, discriminative 91.3 Zhu et al. 
(2013) [40] semi-supervised 91.3 Huang & Harper (2009) [14] semi-supervised 91.3 McClosky et al. (2006) [26] semi-supervised 92.1 Vinyals & Kaiser et al. (2014) [37] semi-supervised 92.1 Transformer (4 layers) semi-supervised 92.7 Luong et al. (2015) [23] multi-task 93.0 Dyer et al. (2016) [8] generative 93.3", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<table class=\"Table\"><tbody><tr><th>Parser</th><th>Training</th><th>WSJ 23 F1</th></tr><tr><td>Vinyals &amp; Kaiser et al. (2014) [37]</td><td>WSJ only, discriminative</td><td>88.3</td></tr><tr><td>Petrov et al. (2006) [29]</td><td>WSJ only, discriminative</td><td>90.4</td></tr><tr><td>Zhu et al. (2013) [40]</td><td>WSJ only, discriminative</td><td>90.4</td></tr><tr><td>Dyer et al. (2016) [8]</td><td>WSJ only, discriminative</td><td>91.7</td></tr><tr><td>Transformer (4 layers)</td><td>WSJ only, discriminative</td><td>91.3</td></tr><tr><td>Zhu et al. (2013) [40]</td><td>semi-supervised</td><td>91.3</td></tr><tr><td>Huang &amp; Harper (2009) [14]</td><td>semi-supervised</td><td>91.3</td></tr><tr><td>McClosky et al. (2006) [26]</td><td>semi-supervised</td><td>92.1</td></tr><tr><td>Vinyals &amp; Kaiser et al. (2014) [37]</td><td>semi-supervised</td><td>92.1</td></tr><tr><td>Transformer (4 layers)</td><td>semi-supervised</td><td>92.7</td></tr><tr><td>Luong et al. (2015) [23]</td><td>multi-task</td><td>93.0</td></tr><tr><td>Dyer et al. (2016) [8]</td><td>generative</td><td>93.3</td></tr></tbody></table>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "18b8bb8390ac495687c11df0e6e08bb1", "text": "increased the maximum output length to input length + 300. We used a beam size of 21 and α = 0.3 for both WSJ only and the semi-supervised setting. 
Our results in Table 4 show that despite the lack of task-specific tuning our model performs surprisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [8]. In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-Parser [29] even when training only on the WSJ training set of 40K sentences.", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<p class=\"NarrativeText\">increased the maximum output length to input length + 300. We used a beam size of 21 and α = 0.3 for both WSJ only and the semi-supervised setting.</p><p class=\"NarrativeText\">Our results in Table 4 show that despite the lack of task-specific tuning our model performs surprisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [8].</p><p class=\"NarrativeText\">In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-Parser [29] even when training only on the WSJ training set of 40K sentences.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "f62736524e6a4324b1c149fdea80cafa", "text": "7 Conclusion", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<h1 class=\"Heading\">7 Conclusion</h1>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "f42355caab3041c3aa22cad6a6a58004", "text": "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder 
architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles. We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<p class=\"NarrativeText\">In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.</p><p class=\"NarrativeText\">For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles.</p><p class=\"NarrativeText\">We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. 
Making generation less sequential is another research goals of ours.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "1d876fae8a6f4413affaa91f3252513d", "text": "The code we used to train and evaluate our models is available at https://github.com/tensorflow/tensor2tensor", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<p class=\"NarrativeText\"><p class=\"Paragraph\">The code we used to train and evaluate our models is available at</p><a class=\"Hyperlink\">https://github.com/tensorflow/tensor2tensor</a></p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": "3a539cdf8607466d8e733b81b617d66c", "text": "Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration.", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<p class=\"NarrativeText\">Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "b97e97d556f848e7af76b09392c2dfdc", "text": "References", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<h2 class=\"Heading\">References</h2>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "NarrativeText", "element_id": 
"944a5da2b78546438e6cdfb98ccae275", "text": "[1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint arXiv:1607.06450, 2016. [2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. CoRR, abs/1409.0473, 2014. [3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. CoRR, abs/1703.03906, 2017. [4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine reading. arXiv preprint arXiv:1601.06733, 2016.", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<p class=\"NarrativeText\">[1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint arXiv:1607.06450, 2016.</p><p class=\"NarrativeText\">[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. CoRR, abs/1409.0473, 2014.</p><p class=\"NarrativeText\">[3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. CoRR, abs/1703.03906, 2017.</p><p class=\"NarrativeText\">[4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine reading. 
arXiv preprint arXiv:1601.06733, 2016.</p>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "45fb036a3a854ebc8216625a5628842f", "text": "10", "metadata": {"category_depth": 1, "page_number": 10, "parent_id": "91c69b10cd9b4be3a1c20dc6d155b7de", "text_as_html": "<span class=\"PageNumber\">10</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "42140ace75d04fe588874d75f174e4e8", "text": "", "metadata": {"category_depth": 0, "page_number": 11, "text_as_html": "<div class=\"Page\" data-page-number=\"11\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "c761bc71ebba448cac1cd59aa3b6d7df", "text": "", "metadata": {"category_depth": 1, "page_number": 11, "parent_id": "42140ace75d04fe588874d75f174e4e8", "text_as_html": "<section class=\"Section\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "842db2451daa453b81e3a0f13abab29f", "text": "Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical machine translation. CoRR , abs/1406.1078, 2014. Francois Chollet. Xception: Deep learning with depthwise separable convolutions. arXiv preprint arXiv:1610.02357 , 2016. Junyoung Chung, Çaglar Gülçehre, Kyunghyun Cho, and Yoshua Bengio. Empirical evaluation of gated recurrent neural networks on sequence modeling. CoRR , abs/1412.3555, 2014. 
Chris Dyer, Adhiguna Kuncoro, Miguel Ballesteros, and Noah A. Smith. Recurrent neural network grammars. In Proc. of NAACL , 2016. Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N. Dauphin. Convolutional sequence to sequence learning. arXiv preprint arXiv:1705.03122 , 2, 2017. Alex Graves. Generating sequences with recurrent neural networks. arXiv preprint arXiv:1308.0850 , 2013. Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition , pages 770–778, 2016. Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and Jürgen Schmidhuber. Gradient flow in recurrent nets: the difficulty of learning long-term dependencies, 2001. Sepp Hochreiter and Jürgen Schmidhuber. Long short-term memory. Neural computation , 9(8):1735–1780, 1997. Zhongqiang Huang and Mary Harper. Self-training PCFG grammars with latent annotations across languages. In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing , pages 832–841. ACL, August 2009. Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410 , 2016. Łukasz Kaiser and Samy Bengio. Can active memory replace attention? In Advances in Neural Information Processing Systems, (NIPS) , 2016. Łukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms. In International Conference on Learning Representations (ICLR) , 2016. Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Koray Kavukcuoglu. Neural machine translation in linear time. arXiv preprint arXiv:1610.10099 , 2, 2017. Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In International Conference on Learning Representations , 2017. Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICLR , 2015. 
Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. arXiv preprint arXiv:1703.10722 , 2017. Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130 , 2017. Minh-Thang Luong, Quoc V. Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task sequence to sequence learning. arXiv preprint arXiv:1511.06114 , 2015. Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-based neural machine translation. arXiv preprint arXiv:1508.04025 , 2015.", "metadata": {"category_depth": 2, "page_number": 11, "parent_id": "c761bc71ebba448cac1cd59aa3b6d7df", "text_as_html": "<ol class=\"OrderedList\" start=\"5\"><li class=\"ListItem\"><p class=\"Paragraph\">Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical machine translation.</p><span class=\"UncategorizedText\">CoRR</span><p class=\"Paragraph\">, abs/1406.1078, 2014.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Francois Chollet. Xception: Deep learning with depthwise separable convolutions.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1610.02357</a><p class=\"Paragraph\">, 2016.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Junyoung Chung, Çaglar Gülçehre, Kyunghyun Cho, and Yoshua Bengio. Empirical evaluation of gated recurrent neural networks on sequence modeling.</p><span class=\"UncategorizedText\">CoRR</span><p class=\"Paragraph\">, abs/1412.3555, 2014.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Chris Dyer, Adhiguna Kuncoro, Miguel Ballesteros, and Noah A. Smith. Recurrent neural network grammars. In</p><span class=\"UncategorizedText\">Proc. 
of NAACL</span><p class=\"Paragraph\">, 2016.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N. Dauphin. Convolutional sequence to sequence learning.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1705.03122</a><p class=\"Paragraph\">, 2, 2017.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Alex Graves. Generating sequences with recurrent neural networks.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1308.0850</a><p class=\"Paragraph\">, 2013.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for image recognition. In</p><span class=\"UncategorizedText\">Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</span><p class=\"Paragraph\">, pages 770–778, 2016.</p></li><li class=\"ListItem\">Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and Jürgen Schmidhuber. Gradient flow in recurrent nets: the difficulty of learning long-term dependencies, 2001.</li><li class=\"ListItem\"><p class=\"Paragraph\">Sepp Hochreiter and Jürgen Schmidhuber. Long short-term memory.</p><span class=\"UncategorizedText\">Neural computation</span><p class=\"Paragraph\">, 9(8):1735–1780, 1997.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Zhongqiang Huang and Mary Harper. Self-training PCFG grammars with latent annotations across languages. In</p><span class=\"UncategorizedText\">Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing</span><p class=\"Paragraph\">, pages 832–841. ACL, August 2009.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. 
Exploring the limits of language modeling.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1602.02410</a><p class=\"Paragraph\">, 2016.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Łukasz Kaiser and Samy Bengio. Can active memory replace attention? In</p><span class=\"UncategorizedText\">Advances in Neural Information Processing Systems, (NIPS)</span><p class=\"Paragraph\">, 2016.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Łukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms. In</p><span class=\"UncategorizedText\">International Conference on Learning Representations (ICLR)</span><p class=\"Paragraph\">, 2016.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Koray Kavukcuoglu. Neural machine translation in linear time.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1610.10099</a><p class=\"Paragraph\">, 2, 2017.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In</p><span class=\"UncategorizedText\">International Conference on Learning Representations</span><p class=\"Paragraph\">, 2017.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In</p><span class=\"UncategorizedText\">ICLR</span><p class=\"Paragraph\">, 2015.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1703.10722</a><p class=\"Paragraph\">, 2017.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 
A structured self-attentive sentence embedding.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1703.03130</a><p class=\"Paragraph\">, 2017.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Minh-Thang Luong, Quoc V. Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task sequence to sequence learning.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1511.06114</a><p class=\"Paragraph\">, 2015.</p></li><li class=\"ListItem\"><p class=\"Paragraph\">Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-based neural machine translation.</p><span class=\"UncategorizedText\">arXiv preprint</span><a class=\"Hyperlink\">arXiv:1508.04025</a><p class=\"Paragraph\">, 2015.</p></li></ol>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "63a1033c47654646999b9d52fd655a11", "text": "11", "metadata": {"category_depth": 2, "page_number": 11, "parent_id": "c761bc71ebba448cac1cd59aa3b6d7df", "text_as_html": "<span class=\"PageNumber\">11</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "5a7d78b8c882479d8dfb017f080c8efa", "text": "", "metadata": {"category_depth": 0, "page_number": 12, "text_as_html": "<div class=\"Page\" data-page-number=\"12\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "3b8ef4016a854f7ab8c9bee25122a335", "text": "[25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated corpus of english: The penn treebank. 
Computational linguistics, 19(2):313–330, 1993.[26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In Proceedings of the Human Language Technology Conference of the NAACL, Main Conference, pages 152–159. ACL, June 2006.[27] Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention model. In Empirical Methods in Natural Language Processing, 2016.[28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive summarization. arXiv preprint arXiv:1705.04304 , 2017.[29] Slav Petrov, Leon Barrett, Romain Thibaux, and Dan Klein. Learning accurate, compact, and interpretable tree annotation. In Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the ACL, pages 433–440. ACL, July 2006.[30] Ofir Press and Lior Wolf. Using the output embedding to improve language models. arXiv preprint arXiv:1608.05859 , 2016.[31] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909 , 2015.[32] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 , 2017.[33] Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. Dropout: a simple way to prevent neural networks from overfitting. Journal of Machine Learning Research, 15(1):1929–1958, 2014.[34] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, Advances in Neural Information Processing Systems 28, pages 2440–2448. Curran Associates, Inc., 2015.[35] Ilya Sutskever, Oriol Vinyals, and Quoc VV Le. Sequence to sequence learning with neural networks. 
In Advances in Neural Information Processing Systems, pages 3104–3112, 2014.[36] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. Rethinking the inception architecture for computer vision. CoRR, abs/1512.00567, 2015.[37] Vinyals & Kaiser, Koo, Petrov, Sutskever, and Hinton. Grammar as a foreign language. In Advances in Neural Information Processing Systems, 2015.[38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 , 2016.[39] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. CoRR, abs/1606.04199, 2016.[40] Muhua Zhu, Yue Zhang, Wenliang Chen, Min Zhang, and Jingbo Zhu. Fast and accurate shift-reduce constituent parsing. In Proceedings of the 51st Annual Meeting of the ACL (Volume 1: Long Papers), pages 434–443. ACL, August 2013.", "metadata": {"category_depth": 1, "page_number": 12, "parent_id": "5a7d78b8c882479d8dfb017f080c8efa", "text_as_html": "<ul class=\"Bibliography\"><p class=\"Paragraph\">[25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated corpus of english: The penn treebank. Computational linguistics, 19(2):313–330, 1993.[26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In Proceedings of the Human Language Technology Conference of the NAACL, Main Conference, pages 152–159. ACL, June 2006.[27] Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention model. In Empirical Methods in Natural Language Processing, 2016.[28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive summarization. 
arXiv preprint</p><a class=\"Hyperlink\">arXiv:1705.04304</a><p class=\"Paragraph\">, 2017.[29] Slav Petrov, Leon Barrett, Romain Thibaux, and Dan Klein. Learning accurate, compact, and interpretable tree annotation. In Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the ACL, pages 433–440. ACL, July 2006.[30] Ofir Press and Lior Wolf. Using the output embedding to improve language models. arXiv preprint</p><a class=\"Hyperlink\">arXiv:1608.05859</a><p class=\"Paragraph\">, 2016.[31] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words with subword units. arXiv preprint</p><a class=\"Hyperlink\">arXiv:1508.07909</a><p class=\"Paragraph\">, 2015.[32] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint</p><a class=\"Hyperlink\">arXiv:1701.06538</a><p class=\"Paragraph\">, 2017.[33] Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. Dropout: a simple way to prevent neural networks from overfitting. Journal of Machine Learning Research, 15(1):1929–1958, 2014.[34] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, Advances in Neural Information Processing Systems 28, pages 2440–2448. Curran Associates, Inc., 2015.[35] Ilya Sutskever, Oriol Vinyals, and Quoc VV Le. Sequence to sequence learning with neural networks. In Advances in Neural Information Processing Systems, pages 3104–3112, 2014.[36] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. Rethinking the inception architecture for computer vision. CoRR, abs/1512.00567, 2015.[37] Vinyals & Kaiser, Koo, Petrov, Sutskever, and Hinton. Grammar as a foreign language. 
In Advances in Neural Information Processing Systems, 2015.[38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint</p><a class=\"Hyperlink\">arXiv:1609.08144</a><p class=\"Paragraph\">, 2016.[39] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. CoRR, abs/1606.04199, 2016.[40] Muhua Zhu, Yue Zhang, Wenliang Chen, Min Zhang, and Jingbo Zhu. Fast and accurate shift-reduce constituent parsing. In Proceedings of the 51st Annual Meeting of the ACL (Volume 1: Long Papers), pages 434–443. ACL, August 2013.</p></ul>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "05e3c51f825144c19135b2ddbba5f8ef", "text": "12", "metadata": {"category_depth": 1, "page_number": 12, "parent_id": "5a7d78b8c882479d8dfb017f080c8efa", "text_as_html": "<span class=\"PageNumber\">12</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "431c08c8e770472388d73572291bf394", "text": "", "metadata": {"category_depth": 0, "page_number": 13, "text_as_html": "<div class=\"Page\" data-page-number=\"13\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Title", "element_id": "f631ddf3f69847ed9a98786df555acd1", "text": "Attention Visualizations", "metadata": {"category_depth": 1, "page_number": 13, "parent_id": "431c08c8e770472388d73572291bf394", "text_as_html": "<h1 class=\"Title\">Attention Visualizations</h1>", 
"languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "e8b76fd7b1124735a0ba2fefef9eb717", "text": "Two parallel attention visualization heatmaps showing long-distance dependencies in encoder self-attention, with colored lines indicating different attention heads focusing on the word &#x27;making&#x27;. Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. Best viewed in color.", "metadata": {"category_depth": 1, "page_number": 13, "parent_id": "431c08c8e770472388d73572291bf394", "text_as_html": "<figure class=\"Figure\"><img class=\"Image\" alt=\"Two parallel attention visualization heatmaps showing long-distance dependencies in encoder self-attention, with colored lines indicating different attention heads focusing on the word &#x27;making&#x27;.\" /><figcaption class=\"Caption\">Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. 
Best viewed in color.</figcaption></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "a2fba16067654729a4b91ceb45220a31", "text": "13", "metadata": {"category_depth": 1, "page_number": 13, "parent_id": "431c08c8e770472388d73572291bf394", "text_as_html": "<span class=\"PageNumber\">13</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "631f121c59a34a10aa22cf27e5b1a9b0", "text": "", "metadata": {"category_depth": 0, "page_number": 14, "text_as_html": "<div class=\"Page\" data-page-number=\"14\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "2f115ed1665d4951963c0f73064e40e3", "text": "Two attention visualization diagrams showing attention patterns. Top diagram shows full attention patterns with multiple connecting lines. Bottom diagram shows isolated attention patterns focusing on the word &#x27;its&#x27; with fewer connections. Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top: Full attentions for head 5. Bottom: Isolated attentions from just the word 'its' for attention heads 5 and 6. Note that the attentions are very sharp for this word.", "metadata": {"category_depth": 1, "page_number": 14, "parent_id": "631f121c59a34a10aa22cf27e5b1a9b0", "text_as_html": "<figure class=\"Figure\"><img class=\"Image\" alt=\"Two attention visualization diagrams showing attention patterns. Top diagram shows full attention patterns with multiple connecting lines. 
Bottom diagram shows isolated attention patterns focusing on the word &#x27;its&#x27; with fewer connections.\" /><figcaption class=\"Caption\">Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top: Full attentions for head 5. Bottom: Isolated attentions from just the word 'its' for attention heads 5 and 6. Note that the attentions are very sharp for this word.</figcaption></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "c8df50da51ec46a4b5bdb957925670a4", "text": "14", "metadata": {"category_depth": 1, "page_number": 14, "parent_id": "631f121c59a34a10aa22cf27e5b1a9b0", "text_as_html": "<span class=\"PageNumber\">14</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "UncategorizedText", "element_id": "3e3d333b07a3430facec54aac6457ff2", "text": "", "metadata": {"category_depth": 0, "page_number": 15, "text_as_html": "<div class=\"Page\" data-page-number=\"15\" />", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "Image", "element_id": "978bdf7bed404897a8b87e4065eb125f", "text": "Attention visualization showing two different attention heads from layer 5 of 6 in an encoder self-attention mechanism, displaying connection patterns between words in a sentence. The visualization shows green and red connection lines between words, demonstrating different learned patterns. Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. 
The heads clearly learned to perform different tasks.", "metadata": {"category_depth": 1, "page_number": 15, "parent_id": "3e3d333b07a3430facec54aac6457ff2", "text_as_html": "<figure class=\"Figure\"><img class=\"Image\" alt=\"Attention visualization showing two different attention heads from layer 5 of 6 in an encoder self-attention mechanism, displaying connection patterns between words in a sentence. The visualization shows green and red connection lines between words, demonstrating different learned patterns.\" /><figcaption class=\"Caption\">Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks.</figcaption></figure>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}, {"type": "PageNumber", "element_id": "b137b7126ffc4f60b502b0048ac2c0c3", "text": "15", "metadata": {"category_depth": 1, "page_number": 15, "parent_id": "3e3d333b07a3430facec54aac6457ff2", "text_as_html": "<span class=\"PageNumber\">15</span>", "languages": ["eng"], "filetype": "application/pdf", "partitioner_type": "vlm_partition", "data_source": {}, "filename": "170603762v7-841f6504.pdf"}}]
\ No newline at end of file

From 9a2071a8978f6dcae0c3120f59a90a30bcb82481 Mon Sep 17 00:00:00 2001
From: AWS Gopher <aws-gopher@users.noreply.github.com>
Date: Thu, 21 Aug 2025 20:30:23 -0400
Subject: [PATCH 2/2] Refactor connector configurations for type safety and
 consistency

- Implemented the `Embedder` type, which previously was empty.
- Added predicate methods to `Enricher` for `isImage`, `isTable` and `isNER`; these are used for validating that only one of each type may be used in a set of Workflow nodes.
- Added `TestWorkflowNodeOrder` to validate ~500 permutations of workflow node types and ordering. An empty workflow node list is now considered invalid, as a partitioner node must always appear in the list.
- Consolidated all `<Foo>ConnectorConfigInput` types into `<Foo>ConnectorConfig`.
- Consolidated `<Foo>SourceConnectorConfig` and `<Foo>DestinationConnectorConfig` types into `<Foo>ConnectorConfig` types for Couchbase, S3, GCS, Kafka Cloud, Postgres, Snowflake, and OneDrive. A few fields changed from values to pointers along the way.
- `ContextualChunkingStrategy` removed from all chunking types; it now defaults to `"v1"` during JSON marshaling.
- `WorkflowType` is removed from `CreateWorkflowRequest` and instead defaults to `"custom"` during JSON marshaling.
- Updated naming for Go-style PascalCase with uppercased acronyms (`Gcs` -> `GCS`, `Ibm` -> `IBM`); also fixed `Onedrive` -> `OneDrive`.
- Fixed copyright year in LICENSE
---
 LICENSE.md                 |   2 +-
 chunker_character.go       |  33 ++--
 chunker_page.go            |  27 +--
 chunker_similarity.go      |  27 +--
 chunker_title.go           |  29 ++--
 destination.go             | 163 ++++++++----------
 destination_create.go      | 329 +------------------------------------
 destination_create_test.go |   4 +-
 destination_get_test.go    |   2 +-
 destination_list_test.go   |   2 +-
 destination_update.go      |   2 +-
 destination_update_test.go |   4 +-
 embedder.go                | 162 ++++++++++++++++++
 embedder_test.go           | 184 +++++++++++++++++++++
 enricher.go                |   6 +-
 shared_config.go           | 200 +++++++++++++++++++---
 source.go                  | 175 ++++++++------------
 source_create.go           | 311 +----------------------------------
 source_create_test.go      |   6 +-
 source_get_test.go         |   2 +-
 source_list_test.go        |   2 +-
 source_update.go           |   2 +-
 source_update_test.go      |   6 +-
 test/destination_test.go   |  50 +++---
 test/main_test.go          |   4 +-
 test/source_test.go        |  56 ++++---
 workflow_create.go         |   9 +-
 workflow_create_test.go    |   1 -
 workflow_node.go           |  40 +++--
 workflow_node_test.go      | 136 +++++++++++++++
 30 files changed, 978 insertions(+), 998 deletions(-)
 create mode 100644 embedder.go
 create mode 100644 embedder_test.go
 create mode 100644 workflow_node_test.go

diff --git a/LICENSE.md b/LICENSE.md
index 662d242..cd9cadb 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 AWS Gopher
+Copyright (c) 2025 AWS Gopher
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/chunker_character.go b/chunker_character.go
index b04bc3a..6f3cd60 100644
--- a/chunker_character.go
+++ b/chunker_character.go
@@ -7,24 +7,17 @@ import (
 
 // ChunkerCharacter is a node that chunks text by character.
 type ChunkerCharacter struct {
-	ID                         string           `json:"-"`
-	Name                       string           `json:"-"`
-	APIURL                     string           `json:"unstructured_api_url,omitempty"`
-	APIKey                     string           `json:"unstructured_api_key,omitempty"`
-	IncludeOrigElements        bool             `json:"include_orig_elements,omitempty"`
-	NewAfterNChars             int              `json:"new_after_n_chars,omitempty"`
-	MaxCharacters              int              `json:"max_characters,omitempty"`
-	Overlap                    int              `json:"overlap,omitempty"`
-	OverlapAll                 bool             `json:"overlap_all"`
-	ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
+	ID                  string `json:"-"`
+	Name                string `json:"-"`
+	APIURL              string `json:"unstructured_api_url,omitempty"`
+	APIKey              string `json:"unstructured_api_key,omitempty"`
+	IncludeOrigElements bool   `json:"include_orig_elements,omitempty"`
+	NewAfterNChars      int    `json:"new_after_n_chars,omitempty"`
+	MaxCharacters       int    `json:"max_characters,omitempty"`
+	Overlap             int    `json:"overlap,omitempty"`
+	OverlapAll          bool   `json:"overlap_all"`
 }
 
-// ChunkingStrategy is a strategy for contextual chunking.
-type ChunkingStrategy string
-
-// ChunkingStrategyV1 is a strategy for contextual chunking.
-const ChunkingStrategyV1 = "v1"
-
 var _ WorkflowNode = new(ChunkerCharacter)
 
 // isNode implements the WorkflowNode interface.
@@ -34,7 +27,13 @@ func (c ChunkerCharacter) isNode() {}
 func (c ChunkerCharacter) MarshalJSON() ([]byte, error) {
 	type alias ChunkerCharacter
 
-	data, err := json.Marshal(alias(c))
+	data, err := json.Marshal(struct {
+		alias
+		ContextualChunkingStrategy string `json:"contextual_chunking_strategy"`
+	}{
+		alias:                      alias(c),
+		ContextualChunkingStrategy: "v1",
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal chunker character: %w", err)
 	}
diff --git a/chunker_page.go b/chunker_page.go
index 47e283f..90ed095 100644
--- a/chunker_page.go
+++ b/chunker_page.go
@@ -7,16 +7,15 @@ import (
 
 // ChunkerPage is a node that chunks text by character.
 type ChunkerPage struct {
-	ID                  string           `json:"-"`
-	Name                string           `json:"-"`
-	APIURL              string           `json:"unstructured_api_url,omitempty"`
-	APIKey              string           `json:"unstructured_api_key,omitempty"`
-	IncludeOrigElements bool             `json:"include_orig_elements,omitempty"`
-	NewAfterNChars      int              `json:"new_after_n_chars,omitempty"`
-	MaxCharacters       int              `json:"max_characters,omitempty"`
-	Overlap             int              `json:"overlap,omitempty"`
-	OverlapAll          bool             `json:"overlap_all"`
-	Strategy            ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
+	ID                  string `json:"-"`
+	Name                string `json:"-"`
+	APIURL              string `json:"unstructured_api_url,omitempty"`
+	APIKey              string `json:"unstructured_api_key,omitempty"`
+	IncludeOrigElements bool   `json:"include_orig_elements,omitempty"`
+	NewAfterNChars      int    `json:"new_after_n_chars,omitempty"`
+	MaxCharacters       int    `json:"max_characters,omitempty"`
+	Overlap             int    `json:"overlap,omitempty"`
+	OverlapAll          bool   `json:"overlap_all"`
 }
 
 var _ WorkflowNode = new(ChunkerPage)
@@ -28,7 +27,13 @@ func (c ChunkerPage) isNode() {}
 func (c ChunkerPage) MarshalJSON() ([]byte, error) {
 	type alias ChunkerPage
 
-	data, err := json.Marshal(alias(c))
+	data, err := json.Marshal(struct {
+		alias
+		ContextualChunkingStrategy string `json:"contextual_chunking_strategy"`
+	}{
+		alias:                      alias(c),
+		ContextualChunkingStrategy: "v1",
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal chunker page: %w", err)
 	}
diff --git a/chunker_similarity.go b/chunker_similarity.go
index 610761e..299fc29 100644
--- a/chunker_similarity.go
+++ b/chunker_similarity.go
@@ -7,16 +7,15 @@ import (
 
 // ChunkerSimilarity is a node that chunks text by character.
 type ChunkerSimilarity struct {
-	ID                  string           `json:"-"`
-	Name                string           `json:"-"`
-	APIURL              string           `json:"unstructured_api_url,omitempty"`
-	APIKey              string           `json:"unstructured_api_key,omitempty"`
-	IncludeOrigElements bool             `json:"include_orig_elements,omitempty"`
-	NewAfterNChars      int              `json:"new_after_n_chars,omitempty"`
-	MaxCharacters       int              `json:"max_characters,omitempty"`
-	Overlap             int              `json:"overlap,omitempty"`
-	OverlapAll          bool             `json:"overlap_all"`
-	Strategy            ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
+	ID                  string `json:"-"`
+	Name                string `json:"-"`
+	APIURL              string `json:"unstructured_api_url,omitempty"`
+	APIKey              string `json:"unstructured_api_key,omitempty"`
+	IncludeOrigElements bool   `json:"include_orig_elements,omitempty"`
+	NewAfterNChars      int    `json:"new_after_n_chars,omitempty"`
+	MaxCharacters       int    `json:"max_characters,omitempty"`
+	Overlap             int    `json:"overlap,omitempty"`
+	OverlapAll          bool   `json:"overlap_all"`
 }
 
 var _ WorkflowNode = new(ChunkerSimilarity)
@@ -28,7 +27,13 @@ func (c ChunkerSimilarity) isNode() {}
 func (c ChunkerSimilarity) MarshalJSON() ([]byte, error) {
 	type alias ChunkerSimilarity
 
-	data, err := json.Marshal(alias(c))
+	data, err := json.Marshal(struct {
+		alias
+		ContextualChunkingStrategy string `json:"contextual_chunking_strategy"`
+	}{
+		alias:                      alias(c),
+		ContextualChunkingStrategy: "v1",
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal chunker similarity: %w", err)
 	}
diff --git a/chunker_title.go b/chunker_title.go
index 5f81c17..da8d637 100644
--- a/chunker_title.go
+++ b/chunker_title.go
@@ -7,17 +7,16 @@ import (
 
 // ChunkerTitle is a node that chunks text by character.
 type ChunkerTitle struct {
-	ID                         string           `json:"-"`
-	Name                       string           `json:"-"`
-	APIURL                     string           `json:"unstructured_api_url,omitempty"`
-	APIKey                     string           `json:"unstructured_api_key,omitempty"`
-	CombineTextUnderN          int              `json:"combine_text_under_n_chars,omitempty"`
-	IncludeOrigElements        bool             `json:"include_orig_elements,omitempty"`
-	NewAfterNChars             int              `json:"new_after_n_chars,omitempty"`
-	MaxCharacters              int              `json:"max_characters,omitempty"`
-	Overlap                    int              `json:"overlap,omitempty"`
-	OverlapAll                 bool             `json:"overlap_all"`
-	ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
+	ID                  string `json:"-"`
+	Name                string `json:"-"`
+	APIURL              string `json:"unstructured_api_url,omitempty"`
+	APIKey              string `json:"unstructured_api_key,omitempty"`
+	CombineTextUnderN   int    `json:"combine_text_under_n_chars,omitempty"`
+	IncludeOrigElements bool   `json:"include_orig_elements,omitempty"`
+	NewAfterNChars      int    `json:"new_after_n_chars,omitempty"`
+	MaxCharacters       int    `json:"max_characters,omitempty"`
+	Overlap             int    `json:"overlap,omitempty"`
+	OverlapAll          bool   `json:"overlap_all"`
 }
 
 var _ WorkflowNode = new(ChunkerTitle)
@@ -29,7 +28,13 @@ func (c ChunkerTitle) isNode() {}
 func (c ChunkerTitle) MarshalJSON() ([]byte, error) {
 	type alias ChunkerTitle
 
-	data, err := json.Marshal(alias(c))
+	data, err := json.Marshal(struct {
+		alias
+		ContextualChunkingStrategy string `json:"contextual_chunking_strategy"`
+	}{
+		alias:                      alias(c),
+		ContextualChunkingStrategy: "v1",
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal chunker title: %w", err)
 	}
diff --git a/destination.go b/destination.go
index 1bcb4bf..c076719 100644
--- a/destination.go
+++ b/destination.go
@@ -11,24 +11,24 @@ import (
 var destinationConfigFactories = map[string]func() DestinationConfig{
 	ConnectorTypeAstraDB:                    func() DestinationConfig { return new(AstraDBConnectorConfig) },
 	ConnectorTypeAzureAISearch:              func() DestinationConfig { return new(AzureAISearchConnectorConfig) },
-	ConnectorTypeCouchbase:                  func() DestinationConfig { return new(CouchbaseDestinationConnectorConfig) },
+	ConnectorTypeCouchbase:                  func() DestinationConfig { return new(CouchbaseConnectorConfig) },
 	ConnectorTypeDatabricksVolumes:          func() DestinationConfig { return new(DatabricksVolumesConnectorConfig) },
 	ConnectorTypeDatabricksVolumeDeltaTable: func() DestinationConfig { return new(DatabricksVDTDestinationConnectorConfig) },
 	ConnectorTypeDeltaTable:                 func() DestinationConfig { return new(DeltaTableConnectorConfig) },
 	ConnectorTypeElasticsearch:              func() DestinationConfig { return new(ElasticsearchConnectorConfig) },
-	ConnectorTypeGCS:                        func() DestinationConfig { return new(GCSDestinationConnectorConfig) },
-	ConnectorTypeKafkaCloud:                 func() DestinationConfig { return new(KafkaCloudDestinationConnectorConfig) },
+	ConnectorTypeGCS:                        func() DestinationConfig { return new(GCSConnectorConfig) },
+	ConnectorTypeKafkaCloud:                 func() DestinationConfig { return new(KafkaCloudConnectorConfig) },
 	ConnectorTypeMilvus:                     func() DestinationConfig { return new(MilvusDestinationConnectorConfig) },
 	ConnectorTypeMongoDB:                    func() DestinationConfig { return new(MongoDBConnectorConfig) },
 	ConnectorTypeMotherDuck:                 func() DestinationConfig { return new(MotherduckDestinationConnectorConfig) },
 	ConnectorTypeNeo4j:                      func() DestinationConfig { return new(Neo4jDestinationConnectorConfig) },
-	ConnectorTypeOneDrive:                   func() DestinationConfig { return new(OneDriveDestinationConnectorConfig) },
+	ConnectorTypeOneDrive:                   func() DestinationConfig { return new(OneDriveConnectorConfig) },
 	ConnectorTypePinecone:                   func() DestinationConfig { return new(PineconeDestinationConnectorConfig) },
-	ConnectorTypePostgres:                   func() DestinationConfig { return new(PostgresDestinationConnectorConfig) },
+	ConnectorTypePostgres:                   func() DestinationConfig { return new(PostgresConnectorConfig) },
 	ConnectorTypeRedis:                      func() DestinationConfig { return new(RedisDestinationConnectorConfig) },
 	ConnectorTypeQdrantCloud:                func() DestinationConfig { return new(QdrantCloudDestinationConnectorConfig) },
-	ConnectorTypeS3:                         func() DestinationConfig { return new(S3DestinationConnectorConfig) },
-	ConnectorTypeSnowflake:                  func() DestinationConfig { return new(SnowflakeDestinationConnectorConfig) },
+	ConnectorTypeS3:                         func() DestinationConfig { return new(S3ConnectorConfig) },
+	ConnectorTypeSnowflake:                  func() DestinationConfig { return new(SnowflakeConnectorConfig) },
 	ConnectorTypeWeaviateCloud:              func() DestinationConfig { return new(WeaviateDestinationConnectorConfig) },
 	ConnectorTypeIBMWatsonxS3:               func() DestinationConfig { return new(IBMWatsonxS3DestinationConnectorConfig) },
 }
@@ -90,6 +90,7 @@ func (d *Destination) UnmarshalJSON(data []byte) error {
 // It provides a way to identify and work with different destination connector types.
 type DestinationConfig interface {
 	isDestinationConfig()
+	Type() string
 }
 
 type destinationconfig struct{}
@@ -101,13 +102,19 @@ func (d destinationconfig) isDestinationConfig() {}
 type AstraDBConnectorConfig struct {
 	destinationconfig
 
-	CollectionName string  `json:"collection_name"`
-	Keyspace       *string `json:"keyspace,omitempty"`
-	BatchSize      int     `json:"batch_size"`
-	APIEndpoint    string  `json:"api_endpoint"`
-	Token          string  `json:"token"`
+	CollectionName  string  `json:"collection_name"`
+	Keyspace        *string `json:"keyspace,omitempty"`
+	BatchSize       *int    `json:"batch_size,omitempty"`
+	APIEndpoint     string  `json:"api_endpoint"`
+	Token           string  `json:"token"`
+	FlattenMetadata *bool   `json:"flatten_metadata,omitempty"`
 }
 
+var _ DestinationConfig = (*AstraDBConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for AstraDB: "astra_db".
+func (c AstraDBConnectorConfig) Type() string { return ConnectorTypeAstraDB }
+
 // AzureAISearchConnectorConfig represents the configuration for an Azure AI Search destination connector.
 // It contains the endpoint, index name, and API key.
 type AzureAISearchConnectorConfig struct {
@@ -118,19 +125,10 @@ type AzureAISearchConnectorConfig struct {
 	Key      string `json:"key"`
 }
 
-// CouchbaseDestinationConnectorConfig represents the configuration for a Couchbase destination connector.
-// It contains connection details, bucket information, and authentication credentials.
-type CouchbaseDestinationConnectorConfig struct {
-	destinationconfig
+var _ DestinationConfig = (*AzureAISearchConnectorConfig)(nil)
 
-	Bucket           string  `json:"bucket"`
-	ConnectionString string  `json:"connection_string"`
-	Scope            *string `json:"scope,omitempty"`
-	Collection       *string `json:"collection,omitempty"`
-	BatchSize        int     `json:"batch_size"`
-	Username         string  `json:"username"`
-	Password         string  `json:"password"`
-}
+// Type always returns the connector type identifier for Azure AI Search: "azure_ai_search".
+func (c AzureAISearchConnectorConfig) Type() string { return ConnectorTypeAzureAISearch }
 
 // DatabricksVDTDestinationConnectorConfig represents the configuration for a Databricks Volume Delta Tables destination connector.
 // It contains server details, authentication, and table configuration.
@@ -150,6 +148,13 @@ type DatabricksVDTDestinationConnectorConfig struct {
 	VolumePath     *string `json:"volume_path,omitempty"`
 }
 
+var _ DestinationConfig = (*DatabricksVDTDestinationConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Databricks Volume Delta Table: "databricks_volume_delta_table".
+func (c DatabricksVDTDestinationConnectorConfig) Type() string {
+	return ConnectorTypeDatabricksVolumeDeltaTable
+}
+
 // DeltaTableConnectorConfig represents the configuration for a Delta Table destination connector.
 // It contains AWS credentials and table URI for Delta Lake storage.
 type DeltaTableConnectorConfig struct {
@@ -161,28 +166,10 @@ type DeltaTableConnectorConfig struct {
 	TableURI           string `json:"table_uri"`
 }
 
-// GCSDestinationConnectorConfig represents the configuration for a Google Cloud Storage destination connector.
-// It contains the remote URL and service account key for authentication.
-type GCSDestinationConnectorConfig struct {
-	destinationconfig
-
-	RemoteURL         string `json:"remote_url"`
-	ServiceAccountKey string `json:"service_account_key"`
-}
-
-// KafkaCloudDestinationConnectorConfig represents the configuration for a Kafka Cloud destination connector.
-// It contains broker details, topic information, and authentication credentials.
-type KafkaCloudDestinationConnectorConfig struct {
-	destinationconfig
+var _ DestinationConfig = (*DeltaTableConnectorConfig)(nil)
 
-	BootstrapServers string  `json:"bootstrap_servers"`
-	Port             *int    `json:"port,omitempty"`
-	GroupID          *string `json:"group_id,omitempty"`
-	Topic            string  `json:"topic"`
-	KafkaAPIKey      string  `json:"kafka_api_key"`
-	Secret           string  `json:"secret"`
-	BatchSize        *int    `json:"batch_size,omitempty"`
-}
+// Type always returns the connector type identifier for Delta Table: "delta_table".
+func (c DeltaTableConnectorConfig) Type() string { return ConnectorTypeDeltaTable }
 
 // MilvusDestinationConnectorConfig represents the configuration for a Milvus destination connector.
 // It contains connection details, collection information, and authentication.
@@ -198,6 +185,11 @@ type MilvusDestinationConnectorConfig struct {
 	RecordIDKey    string  `json:"record_id_key"`
 }
 
+var _ DestinationConfig = (*MilvusDestinationConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Milvus: "milvus".
+func (c MilvusDestinationConnectorConfig) Type() string { return ConnectorTypeMilvus }
+
 // Neo4jDestinationConnectorConfig represents the configuration for a Neo4j destination connector.
 // It contains database connection details and authentication credentials.
 type Neo4jDestinationConnectorConfig struct {
@@ -210,6 +202,11 @@ type Neo4jDestinationConnectorConfig struct {
 	BatchSize *int   `json:"batch_size,omitempty"`
 }
 
+var _ DestinationConfig = (*Neo4jDestinationConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Neo4j: "neo4j".
+func (c Neo4jDestinationConnectorConfig) Type() string { return ConnectorTypeNeo4j }
+
 // MotherduckDestinationConnectorConfig represents the configuration for a MotherDuck destination connector.
 // It contains database connection details and authentication credentials.
 type MotherduckDestinationConnectorConfig struct {
@@ -228,18 +225,10 @@ type MotherduckDestinationConnectorConfig struct {
 	RecordIDKey *string `json:"record_id_key,omitempty"`
 }
 
-// OneDriveDestinationConnectorConfig represents the configuration for a OneDrive destination connector.
-// It contains Microsoft Graph API authentication and file storage details.
-type OneDriveDestinationConnectorConfig struct {
-	destinationconfig
+var _ DestinationConfig = (*MotherduckDestinationConnectorConfig)(nil)
 
-	ClientID     string `json:"client_id"`
-	UserPName    string `json:"user_pname"`
-	Tenant       string `json:"tenant"`
-	AuthorityURL string `json:"authority_url"`
-	ClientCred   string `json:"client_cred"`
-	RemoteURL    string `json:"remote_url"`
-}
+// Type always returns ConnectorTypeMotherDuck, the connector type identifier for MotherDuck.
+func (c MotherduckDestinationConnectorConfig) Type() string { return ConnectorTypeMotherDuck }
 
 // PineconeDestinationConnectorConfig represents the configuration for a Pinecone destination connector.
 // It contains index details, API key, and namespace information.
@@ -252,19 +241,10 @@ type PineconeDestinationConnectorConfig struct {
 	BatchSize *int   `json:"batch_size,omitempty"`
 }
 
-// PostgresDestinationConnectorConfig represents the configuration for a PostgreSQL destination connector.
-// It contains database connection details and table configuration.
-type PostgresDestinationConnectorConfig struct {
-	destinationconfig
+var _ DestinationConfig = (*PineconeDestinationConnectorConfig)(nil)
 
-	Host      string `json:"host"`
-	Database  string `json:"database"`
-	Port      int    `json:"port"`
-	Username  string `json:"username"`
-	Password  string `json:"password"`
-	TableName string `json:"table_name"`
-	BatchSize int    `json:"batch_size"`
-}
+// Type always returns the connector type identifier for Pinecone: "pinecone".
+func (c PineconeDestinationConnectorConfig) Type() string { return ConnectorTypePinecone }
 
 // RedisDestinationConnectorConfig represents the configuration for a Redis destination connector.
 // It contains connection details, database selection, and authentication.
@@ -281,6 +261,11 @@ type RedisDestinationConnectorConfig struct {
 	BatchSize *int    `json:"batch_size,omitempty"`
 }
 
+var _ DestinationConfig = (*RedisDestinationConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Redis: "redis".
+func (c RedisDestinationConnectorConfig) Type() string { return ConnectorTypeRedis }
+
 // QdrantCloudDestinationConnectorConfig represents the configuration for a Qdrant Cloud destination connector.
 // It contains API endpoint, collection details, and authentication.
 type QdrantCloudDestinationConnectorConfig struct {
@@ -292,36 +277,10 @@ type QdrantCloudDestinationConnectorConfig struct {
 	BatchSize      *int   `json:"batch_size,omitempty"`
 }
 
-// S3DestinationConnectorConfig represents the configuration for an Amazon S3 destination connector.
-// It supports both AWS S3 and S3-compatible storage services for storing processed data.
-type S3DestinationConnectorConfig struct {
-	destinationconfig
-
-	RemoteURL   string  `json:"remote_url"`
-	Anonymous   bool    `json:"anonymous"`
-	Key         *string `json:"key,omitempty"`
-	Secret      *string `json:"secret,omitempty"`
-	Token       *string `json:"token,omitempty"`
-	EndpointURL *string `json:"endpoint_url,omitempty"`
-}
-
-// SnowflakeDestinationConnectorConfig represents the configuration for a Snowflake destination connector.
-// It contains account details, authentication, and table configuration.
-type SnowflakeDestinationConnectorConfig struct {
-	destinationconfig
+var _ DestinationConfig = (*QdrantCloudDestinationConnectorConfig)(nil)
 
-	Account     string  `json:"account"`
-	Role        string  `json:"role"`
-	User        string  `json:"user"`
-	Password    string  `json:"password"`
-	Host        string  `json:"host"`
-	Port        *int    `json:"port,omitempty"`
-	Database    string  `json:"database"`
-	Schema      *string `json:"schema,omitempty"`
-	TableName   *string `json:"table_name,omitempty"`
-	BatchSize   *int    `json:"batch_size,omitempty"`
-	RecordIDKey *string `json:"record_id_key,omitempty"`
-}
+// Type always returns ConnectorTypeQdrantCloud, the connector type identifier for Qdrant Cloud.
+func (c QdrantCloudDestinationConnectorConfig) Type() string { return ConnectorTypeQdrantCloud }
 
 // WeaviateDestinationConnectorConfig represents the configuration for a Weaviate destination connector.
 // It contains cluster URL, API key, and collection information.
@@ -333,6 +292,11 @@ type WeaviateDestinationConnectorConfig struct {
 	Collection *string `json:"collection,omitempty"`
 }
 
+var _ DestinationConfig = (*WeaviateDestinationConnectorConfig)(nil)
+
+// Type always returns ConnectorTypeWeaviateCloud, the connector type identifier for Weaviate Cloud.
+func (c WeaviateDestinationConnectorConfig) Type() string { return ConnectorTypeWeaviateCloud }
+
 // IBMWatsonxS3DestinationConnectorConfig represents the configuration for an IBM Watsonx S3 destination connector.
 // It contains IBM Cloud authentication, storage endpoints, and table configuration.
 type IBMWatsonxS3DestinationConnectorConfig struct {
@@ -351,3 +315,8 @@ type IBMWatsonxS3DestinationConnectorConfig struct {
 	MaxRetries            *int    `json:"max_retries,omitempty"`
 	RecordIDKey           *string `json:"record_id_key,omitempty"`
 }
+
+var _ DestinationConfig = (*IBMWatsonxS3DestinationConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for IBM Watsonx S3: "ibm_watsonx_s3".
+func (c IBMWatsonxS3DestinationConnectorConfig) Type() string { return ConnectorTypeIBMWatsonxS3 }
diff --git a/destination_create.go b/destination_create.go
index 6b6cbbe..0795150 100644
--- a/destination_create.go
+++ b/destination_create.go
@@ -54,332 +54,5 @@ func (c *Client) CreateDestination(ctx context.Context, in CreateDestinationRequ
 // It contains the name, type, and configuration for the destination.
 type CreateDestinationRequest struct {
 	Name   string
-	Config DestinationConfigInput
+	Config DestinationConfig
 }
-
-// DestinationConfigInput is an interface that all destination connector configurations must implement.
-// It provides a way to identify the type of destination connector and marshal its configuration.
-type DestinationConfigInput interface {
-	isDestinationConfigInput()
-	Type() string
-}
-
-type destinationconfiginput struct{}
-
-func (s destinationconfiginput) isDestinationConfigInput() {}
-
-// AstraDBConnectorConfigInput represents the configuration for an AstraDB destination connector.
-// It contains the collection name, keyspace, batch size, API endpoint, and token.
-type AstraDBConnectorConfigInput struct {
-	destinationconfiginput
-
-	CollectionName  string  `json:"collection_name"`
-	Keyspace        *string `json:"keyspace,omitempty"`
-	BatchSize       *int    `json:"batch_size,omitempty"`
-	APIEndpoint     string  `json:"api_endpoint"`
-	Token           string  `json:"token"`
-	FlattenMetadata *bool   `json:"flatten_metadata,omitempty"`
-}
-
-// Type always returns the connector type identifier for AstraDB: "astradb".
-func (c AstraDBConnectorConfigInput) Type() string { return ConnectorTypeAstraDB }
-
-// AzureAISearchConnectorConfigInput represents the configuration for an Azure AI Search destination connector.
-// It contains the endpoint, index name, and API key.
-type AzureAISearchConnectorConfigInput struct {
-	destinationconfiginput
-
-	Endpoint string `json:"endpoint"`
-	Index    string `json:"index"`
-	Key      string `json:"key"`
-}
-
-// Type always returns the connector type identifier for Azure AI Search: "azure_ai_search".
-func (c AzureAISearchConnectorConfigInput) Type() string { return ConnectorTypeAzureAISearch }
-
-// CouchbaseDestinationConnectorConfigInput represents the configuration for a Couchbase destination connector.
-// It contains connection details, bucket information, and authentication credentials.
-type CouchbaseDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	Bucket           string  `json:"bucket"`
-	ConnectionString string  `json:"connection_string"`
-	Scope            *string `json:"scope,omitempty"`
-	Collection       *string `json:"collection,omitempty"`
-	BatchSize        int     `json:"batch_size"`
-	Username         string  `json:"username"`
-	Password         string  `json:"password"`
-}
-
-// Type always returns the connector type identifier for Couchbase: "couchbase".
-func (c CouchbaseDestinationConnectorConfigInput) Type() string { return ConnectorTypeCouchbase }
-
-// DatabricksVDTDestinationConnectorConfigInput represents the configuration for a Databricks Volume Delta Tables destination connector.
-// It contains server details, authentication, and table configuration.
-type DatabricksVDTDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	ServerHostname string  `json:"server_hostname"`
-	HTTPPath       string  `json:"http_path"`
-	Token          *string `json:"token,omitempty"`
-	ClientID       *string `json:"client_id,omitempty"`
-	ClientSecret   *string `json:"client_secret,omitempty"`
-	Catalog        string  `json:"catalog"`
-	Database       *string `json:"database,omitempty"`
-	TableName      *string `json:"table_name,omitempty"`
-	Schema         *string `json:"schema,omitempty"`
-	Volume         string  `json:"volume"`
-	VolumePath     *string `json:"volume_path,omitempty"`
-}
-
-// Type always returns the connector type identifier for Databricks Volume Delta Tables: "databricks_volume_delta_tables".
-func (c DatabricksVDTDestinationConnectorConfigInput) Type() string {
-	return ConnectorTypeDatabricksVolumeDeltaTable
-}
-
-// DeltaTableConnectorConfigInput represents the configuration for a Delta Table destination connector.
-// It contains AWS credentials and table URI for Delta Lake storage.
-type DeltaTableConnectorConfigInput struct {
-	destinationconfiginput
-
-	AwsAccessKeyID     string `json:"aws_access_key_id"`
-	AwsSecretAccessKey string `json:"aws_secret_access_key"`
-	AwsRegion          string `json:"aws_region"`
-	TableURI           string `json:"table_uri"`
-}
-
-// Type always returns the connector type identifier for Delta Table: "delta_table".
-func (c DeltaTableConnectorConfigInput) Type() string { return ConnectorTypeDeltaTable }
-
-// GCSDestinationConnectorConfigInput represents the configuration for a Google Cloud Storage destination connector.
-// It contains the remote URL and service account key for authentication.
-type GCSDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	RemoteURL         string `json:"remote_url"`
-	ServiceAccountKey string `json:"service_account_key"`
-}
-
-// Type always returns the connector type identifier for Google Cloud Storage: "gcs".
-func (c GCSDestinationConnectorConfigInput) Type() string { return ConnectorTypeGCS }
-
-// KafkaCloudDestinationConnectorConfigInput represents the configuration for a Kafka Cloud destination connector.
-// It contains broker details, topic information, and authentication credentials.
-type KafkaCloudDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	BootstrapServers string  `json:"bootstrap_servers"`
-	Port             *int    `json:"port,omitempty"`
-	GroupID          *string `json:"group_id,omitempty"`
-	Topic            string  `json:"topic"`
-	KafkaAPIKey      string  `json:"kafka_api_key"`
-	Secret           string  `json:"secret"`
-	BatchSize        *int    `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Kafka Cloud: "kafka-cloud".
-func (c KafkaCloudDestinationConnectorConfigInput) Type() string { return ConnectorTypeKafkaCloud }
-
-// MilvusDestinationConnectorConfigInput represents the configuration for a Milvus destination connector.
-// It contains connection details, collection information, and authentication.
-type MilvusDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	URI            string  `json:"uri"`
-	User           *string `json:"user,omitempty"`
-	Token          *string `json:"token,omitempty"`
-	Password       *string `json:"password,omitempty"`
-	DBName         *string `json:"db_name,omitempty"`
-	CollectionName string  `json:"collection_name"`
-	RecordIDKey    string  `json:"record_id_key"`
-}
-
-// Type always returns the connector type identifier for Milvus: "milvus".
-func (c MilvusDestinationConnectorConfigInput) Type() string { return ConnectorTypeMilvus }
-
-// MotherduckDestinationConnectorConfigInput represents the configuration for a MotherDuck destination connector.
-// It contains database connection details and authentication credentials.
-type MotherduckDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	Account     string  `json:"account"`
-	Role        string  `json:"role"`
-	User        string  `json:"user"`
-	Password    string  `json:"password"`
-	Host        string  `json:"host"`
-	Port        *int    `json:"port,omitempty"`
-	Database    string  `json:"database"`
-	Schema      *string `json:"schema,omitempty"`
-	TableName   *string `json:"table_name,omitempty"`
-	BatchSize   *int    `json:"batch_size,omitempty"`
-	RecordIDKey *string `json:"record_id_key,omitempty"`
-}
-
-// Type always returns the connector type identifier for Motherduck: "motherduck".
-func (c MotherduckDestinationConnectorConfigInput) Type() string { return ConnectorTypeMotherDuck }
-
-// Neo4jDestinationConnectorConfigInput represents the configuration for a Neo4j destination connector.
-// It contains database connection details and authentication credentials.
-type Neo4jDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	URI       string `json:"uri"`
-	Database  string `json:"database"`
-	Username  string `json:"username"`
-	Password  string `json:"password"`
-	BatchSize *int   `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Neo4j: "neo4j".
-func (c Neo4jDestinationConnectorConfigInput) Type() string { return ConnectorTypeNeo4j }
-
-// OneDriveDestinationConnectorConfigInput represents the configuration for a OneDrive destination connector.
-// It contains Microsoft Graph API authentication and file storage details.
-type OneDriveDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	ClientID     string `json:"client_id"`
-	UserPName    string `json:"user_pname"`
-	Tenant       string `json:"tenant"`
-	AuthorityURL string `json:"authority_url"`
-	ClientCred   string `json:"client_cred"`
-	RemoteURL    string `json:"remote_url"`
-}
-
-// Type always returns the connector type identifier for OneDrive: "onedrive".
-func (c OneDriveDestinationConnectorConfigInput) Type() string { return ConnectorTypeOneDrive }
-
-// PineconeDestinationConnectorConfigInput represents the configuration for a Pinecone destination connector.
-// It contains index details, API key, and namespace information.
-type PineconeDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	IndexName string `json:"index_name"`
-	APIKey    string `json:"api_key"`
-	Namespace string `json:"namespace"`
-	BatchSize *int   `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Pinecone: "pinecone".
-func (c PineconeDestinationConnectorConfigInput) Type() string { return ConnectorTypePinecone }
-
-// PostgresDestinationConnectorConfigInput represents the configuration for a PostgreSQL destination connector.
-// It contains database connection details and table configuration.
-type PostgresDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	Host      string `json:"host"`
-	Database  string `json:"database"`
-	Port      int    `json:"port"`
-	Username  string `json:"username"`
-	Password  string `json:"password"`
-	TableName string `json:"table_name"`
-	BatchSize int    `json:"batch_size"`
-}
-
-// Type always returns the connector type identifier for PostgreSQL: "postgres".
-func (c PostgresDestinationConnectorConfigInput) Type() string { return ConnectorTypePostgres }
-
-// RedisDestinationConnectorConfigInput represents the configuration for a Redis destination connector.
-// It contains connection details, database selection, and authentication.
-type RedisDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	Host      string  `json:"host"`
-	Port      *int    `json:"port,omitempty"`
-	Username  *string `json:"username,omitempty"`
-	Password  *string `json:"password,omitempty"`
-	URI       *string `json:"uri,omitempty"`
-	Database  *int    `json:"database,omitempty"`
-	SSL       *bool   `json:"ssl,omitempty"`
-	BatchSize *int    `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Redis: "redis".
-func (c RedisDestinationConnectorConfigInput) Type() string { return ConnectorTypeRedis }
-
-// QdrantCloudDestinationConnectorConfigInput represents the configuration for a Qdrant Cloud destination connector.
-// It contains API endpoint, collection details, and authentication.
-type QdrantCloudDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	URL            string `json:"url"`
-	APIKey         string `json:"api_key"`
-	CollectionName string `json:"collection_name"`
-	BatchSize      *int   `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Qdrant Cloud: "qdrant-cloud".
-func (c QdrantCloudDestinationConnectorConfigInput) Type() string { return ConnectorTypeQdrantCloud }
-
-// S3DestinationConnectorConfigInput represents the configuration for an Amazon S3 destination connector.
-// It supports both AWS S3 and S3-compatible storage services for storing processed data.
-type S3DestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	RemoteURL   string  `json:"remote_url"`
-	Anonymous   *bool   `json:"anonymous,omitempty"`
-	Key         *string `json:"key,omitempty"`
-	Secret      *string `json:"secret,omitempty"`
-	Token       *string `json:"token,omitempty"`
-	EndpointURL *string `json:"endpoint_url,omitempty"`
-}
-
-// Type always returns the connector type identifier for S3: "s3".
-func (c S3DestinationConnectorConfigInput) Type() string { return ConnectorTypeS3 }
-
-// SnowflakeDestinationConnectorConfigInput represents the configuration for a Snowflake destination connector.
-// It contains account details, authentication, and table configuration.
-type SnowflakeDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	Account     string  `json:"account"`
-	Role        string  `json:"role"`
-	User        string  `json:"user"`
-	Password    string  `json:"password"`
-	Host        string  `json:"host"`
-	Port        *int    `json:"port,omitempty"`
-	Database    string  `json:"database"`
-	Schema      *string `json:"schema,omitempty"`
-	TableName   *string `json:"table_name,omitempty"`
-	BatchSize   *int    `json:"batch_size,omitempty"`
-	RecordIDKey *string `json:"record_id_key,omitempty"`
-}
-
-// Type always returns the connector type identifier for Snowflake: "snowflake".
-func (c SnowflakeDestinationConnectorConfigInput) Type() string { return ConnectorTypeSnowflake }
-
-// WeaviateDestinationConnectorConfigInput represents the configuration for a Weaviate destination connector.
-// It contains cluster URL, API key, and collection information.
-type WeaviateDestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	ClusterURL string  `json:"cluster_url"`
-	APIKey     string  `json:"api_key"`
-	Collection *string `json:"collection,omitempty"`
-}
-
-// Type always returns the connector type identifier for Weaviate Cloud: "weaviate-cloud".
-func (c WeaviateDestinationConnectorConfigInput) Type() string { return ConnectorTypeWeaviateCloud }
-
-// IBMWatsonxS3DestinationConnectorConfigInput represents the configuration for an IBM Watsonx S3 destination connector.
-// It contains IBM Cloud authentication, storage endpoints, and table configuration.
-type IBMWatsonxS3DestinationConnectorConfigInput struct {
-	destinationconfiginput
-
-	IAMApiKey             string  `json:"iam_api_key"`
-	AccessKeyID           string  `json:"access_key_id"`
-	SecretAccessKey       string  `json:"secret_access_key"`
-	IcebergEndpoint       string  `json:"iceberg_endpoint"`
-	ObjectStorageEndpoint string  `json:"object_storage_endpoint"`
-	ObjectStorageRegion   string  `json:"object_storage_region"`
-	Catalog               string  `json:"catalog"`
-	MaxRetriesConnection  *int    `json:"max_retries_connection,omitempty"`
-	Namespace             string  `json:"namespace"`
-	Table                 string  `json:"table"`
-	MaxRetries            *int    `json:"max_retries,omitempty"`
-	RecordIDKey           *string `json:"record_id_key,omitempty"`
-}
-
-// Type always returns the connector type identifier for IBM Watsonx S3: "ibm_watsonx_s3".
-func (c IBMWatsonxS3DestinationConnectorConfigInput) Type() string { return ConnectorTypeIBMWatsonxS3 }
diff --git a/destination_create_test.go b/destination_create_test.go
index 622e7a9..9dbf89c 100644
--- a/destination_create_test.go
+++ b/destination_create_test.go
@@ -35,7 +35,7 @@ func TestCreateDestination(t *testing.T) {
 	destination, err := client.CreateDestination(t.Context(), CreateDestinationRequest{
 		Name: "test_destination_name",
 
-		Config: &S3DestinationConnectorConfigInput{
+		Config: &S3ConnectorConfig{
 			RemoteURL: "s3://mock-s3-connector",
 			Key:       String("blah"),
 			Secret:    String("blah"),
@@ -53,7 +53,7 @@ func TestCreateDestination(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := destination.Config.(*S3DestinationConnectorConfig)
+	cfg, ok := destination.Config.(*S3ConnectorConfig)
 	if !ok {
 		t.Errorf("expected destination config to be %T, got %T", cfg, destination.Config)
 	}
diff --git a/destination_get_test.go b/destination_get_test.go
index 28ebc00..abce398 100644
--- a/destination_get_test.go
+++ b/destination_get_test.go
@@ -54,7 +54,7 @@ func TestGetDestination(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := destination.Config.(*S3DestinationConnectorConfig)
+	cfg, ok := destination.Config.(*S3ConnectorConfig)
 	if !ok {
 		t.Errorf("expected destination config to be %T, got %T", cfg, destination.Config)
 	}
diff --git a/destination_list_test.go b/destination_list_test.go
index 487b49a..f54c16f 100644
--- a/destination_list_test.go
+++ b/destination_list_test.go
@@ -56,7 +56,7 @@ func TestListDestinations(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := destination.Config.(*S3DestinationConnectorConfig)
+	cfg, ok := destination.Config.(*S3ConnectorConfig)
 	if !ok {
 		t.Errorf("expected destination config to be %T, got %T", cfg, destination.Config)
 	}
diff --git a/destination_update.go b/destination_update.go
index 77b3415..9229d90 100644
--- a/destination_update.go
+++ b/destination_update.go
@@ -11,7 +11,7 @@ import (
 // UpdateDestinationRequest represents the request to update a destination connector.
 type UpdateDestinationRequest struct {
 	ID     string
-	Config DestinationConfigInput
+	Config DestinationConfig
 }
 
 // UpdateDestination updates the configuration of an existing destination connector.
diff --git a/destination_update_test.go b/destination_update_test.go
index 3ab9979..686c3fe 100644
--- a/destination_update_test.go
+++ b/destination_update_test.go
@@ -42,7 +42,7 @@ func TestUpdateDestination(t *testing.T) {
 
 	updated, err := client.UpdateDestination(t.Context(), UpdateDestinationRequest{
 		ID: id,
-		Config: &S3DestinationConnectorConfigInput{
+		Config: &S3ConnectorConfig{
 			RemoteURL: "s3://mock-s3-connector",
 			Key:       String("blah"),
 			Secret:    String("blah"),
@@ -60,7 +60,7 @@ func TestUpdateDestination(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := updated.Config.(*S3DestinationConnectorConfig)
+	cfg, ok := updated.Config.(*S3ConnectorConfig)
 	if !ok {
 		t.Errorf("expected destination config to be %T, got %T", cfg, updated.Config)
 	}
diff --git a/embedder.go b/embedder.go
new file mode 100644
index 0000000..77873b7
--- /dev/null
+++ b/embedder.go
@@ -0,0 +1,162 @@
+package unstructured
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+// EmbedderSubtype names an embedder subtype; Embedder.MarshalJSON writes it as the node header's subtype.
+type EmbedderSubtype string
+
+// EmbedderSubtype constants.
+const (
+	EmbedderSubtypeAzureOpenAI EmbedderSubtype = "azure_openai"
+	EmbedderSubtypeBedrock     EmbedderSubtype = "bedrock"
+	EmbedderSubtypeTogetherAI  EmbedderSubtype = "togetherai"
+	EmbedderSubtypeVoyageAI    EmbedderSubtype = "voyageai"
+)
+
+// EmbedderModel names an embedding model; Embedder.ValidateModel checks it against the models listed for a subtype.
+type EmbedderModel string
+
+// EmbedderModel constants for Azure OpenAI.
+const (
+	EmbedderModelAzureOpenAITextEmbedding3Small EmbedderModel = "text-embedding-3-small"
+	EmbedderModelAzureOpenAITextEmbedding3Large EmbedderModel = "text-embedding-3-large"
+	EmbedderModelAzureOpenAITextEmbeddingAda002 EmbedderModel = "text-embedding-ada-002"
+)
+
+// EmbedderModel constants for Bedrock.
+const (
+	EmbedderModelBedrockTitanEmbedTextV2        EmbedderModel = "amazon.titan-embed-text-v2:0"
+	EmbedderModelBedrockTitanEmbedTextV1        EmbedderModel = "amazon.titan-embed-text-v1"
+	EmbedderModelBedrockTitanEmbedImageV1       EmbedderModel = "amazon.titan-embed-image-v1"
+	EmbedderModelBedrockCohereEmbedEnglish      EmbedderModel = "cohere.embed-english-v3"
+	EmbedderModelBedrockCohereEmbedMultilingual EmbedderModel = "cohere.embed-multilingual-v3"
+)
+
+// EmbedderModel constants for TogetherAI.
+const (
+	EmbedderModelTogetherAIM2Bert80M32kRetrieval EmbedderModel = "togethercomputer/m2-bert-80M-32k-retrieval"
+)
+
+// EmbedderModel constants for VoyageAI.
+const (
+	EmbedderModelVoyageAI3           EmbedderModel = "voyage-3"
+	EmbedderModelVoyageAI3Large      EmbedderModel = "voyage-3-large"
+	EmbedderModelVoyageAI3Lite       EmbedderModel = "voyage-3-lite"
+	EmbedderModelVoyageAICode3       EmbedderModel = "voyage-code-3"
+	EmbedderModelVoyageAIFinance2    EmbedderModel = "voyage-finance-2"
+	EmbedderModelVoyageAILaw2        EmbedderModel = "voyage-law-2"
+	EmbedderModelVoyageAICode2       EmbedderModel = "voyage-code-2"
+	EmbedderModelVoyageAIMultimodal3 EmbedderModel = "voyage-multimodal-3"
+)
+
+// Embedder represents an embedding node in a workflow.
+type Embedder struct {
+	ID        string          `json:"-"`          // written to the node header by MarshalJSON, not to the settings
+	Name      string          `json:"-"`          // written to the node header by MarshalJSON, not to the settings
+	Subtype   EmbedderSubtype `json:"-"`          // written as the node header's subtype
+	ModelName EmbedderModel   `json:"model_name"` // the only field carried in the node settings
+}
+
+var _ WorkflowNode = new(Embedder)
+
+// isNode implements the WorkflowNode interface.
+func (e Embedder) isNode() {}
+
+// MarshalJSON implements json.Marshaler: it encodes the node header with the model name as its settings.
+func (e Embedder) MarshalJSON() ([]byte, error) {
+	type embedSettings struct {
+		ModelName EmbedderModel `json:"model_name"`
+	}
+	settings, err := json.Marshal(embedSettings{ModelName: e.ModelName})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal embedder settings: %w", err)
+	}
+
+	node := header{
+		ID:       e.ID,
+		Name:     e.Name,
+		Type:     nodeTypeEmbed,
+		Subtype:  string(e.Subtype),
+		Settings: json.RawMessage(settings),
+	}
+	encoded, err := json.Marshal(node)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal embedder header: %w", err)
+	}
+
+	return encoded, nil
+}
+
+// ValidateModel returns nil when ModelName is listed for Subtype, and a descriptive error otherwise.
+func (e *Embedder) ValidateModel() error {
+	known := func(models ...EmbedderModel) bool {
+		for _, m := range models {
+			if m == e.ModelName {
+				return true
+			}
+		}
+		return false
+	}
+
+	switch e.Subtype {
+	case EmbedderSubtypeAzureOpenAI:
+		if !known(
+			EmbedderModelAzureOpenAITextEmbedding3Small,
+			EmbedderModelAzureOpenAITextEmbedding3Large,
+			EmbedderModelAzureOpenAITextEmbeddingAda002,
+		) {
+			return fmt.Errorf("invalid model %s for Azure OpenAI embedder", e.ModelName)
+		}
+	case EmbedderSubtypeBedrock:
+		if !known(
+			EmbedderModelBedrockTitanEmbedTextV2,
+			EmbedderModelBedrockTitanEmbedTextV1,
+			EmbedderModelBedrockTitanEmbedImageV1,
+			EmbedderModelBedrockCohereEmbedEnglish,
+			EmbedderModelBedrockCohereEmbedMultilingual,
+		) {
+			return fmt.Errorf("invalid model %s for Bedrock embedder", e.ModelName)
+		}
+	case EmbedderSubtypeTogetherAI:
+		if !known(EmbedderModelTogetherAIM2Bert80M32kRetrieval) {
+			return fmt.Errorf("invalid model %s for TogetherAI embedder", e.ModelName)
+		}
+	case EmbedderSubtypeVoyageAI:
+		if !known(
+			EmbedderModelVoyageAI3,
+			EmbedderModelVoyageAI3Large,
+			EmbedderModelVoyageAI3Lite,
+			EmbedderModelVoyageAICode3,
+			EmbedderModelVoyageAIFinance2,
+			EmbedderModelVoyageAILaw2,
+			EmbedderModelVoyageAICode2,
+			EmbedderModelVoyageAIMultimodal3,
+		) {
+			return fmt.Errorf("invalid model %s for VoyageAI embedder", e.ModelName)
+		}
+	default:
+		return fmt.Errorf("unknown embedder subtype: %s", e.Subtype)
+	}
+	return nil
+}
+
+// unmarshalEmbedder builds an Embedder from a node header and its settings, rejecting invalid subtype/model pairs.
+func unmarshalEmbedder(hdr header) (WorkflowNode, error) {
+	node := new(Embedder)
+	node.ID, node.Name = hdr.ID, hdr.Name
+	node.Subtype = EmbedderSubtype(hdr.Subtype)
+
+	// Only model_name is decoded from the settings; the other fields are tagged json:"-".
+	err := json.Unmarshal(hdr.Settings, node)
+	if err != nil {
+		return nil, fmt.Errorf("failed to unmarshal embedder node: %w", err)
+	}
+
+	if err = node.ValidateModel(); err != nil {
+		return nil, fmt.Errorf("invalid embedder configuration: %w", err)
+	}
+	return node, nil
+}
diff --git a/embedder_test.go b/embedder_test.go
new file mode 100644
index 0000000..0fbdbb2
--- /dev/null
+++ b/embedder_test.go
@@ -0,0 +1,184 @@
+package unstructured
+
+import (
+	"encoding/json"
+	"testing"
+)
+
+// TestEmbedder_MarshalJSON marshals one embedder per subtype and checks the
+// resulting JSON envelope: type, subtype and settings.model_name.
+func TestEmbedder_MarshalJSON(t *testing.T) {
+	cases := []struct {
+		name     string
+		embedder *Embedder
+		wantErr  bool
+	}{
+		{
+			name: "Azure OpenAI embedder",
+			embedder: &Embedder{
+				ID:        "test-id",
+				Name:      "Test Embedder",
+				Subtype:   EmbedderSubtypeAzureOpenAI,
+				ModelName: EmbedderModelAzureOpenAITextEmbedding3Small,
+			},
+			wantErr: false,
+		},
+		{
+			name: "Bedrock embedder",
+			embedder: &Embedder{
+				ID:        "test-id",
+				Name:      "Test Embedder",
+				Subtype:   EmbedderSubtypeBedrock,
+				ModelName: EmbedderModelBedrockTitanEmbedTextV2,
+			},
+			wantErr: false,
+		},
+		{
+			name: "TogetherAI embedder",
+			embedder: &Embedder{
+				ID:        "test-id",
+				Name:      "Test Embedder",
+				Subtype:   EmbedderSubtypeTogetherAI,
+				ModelName: EmbedderModelTogetherAIM2Bert80M32kRetrieval,
+			},
+			wantErr: false,
+		},
+		{
+			name: "VoyageAI embedder",
+			embedder: &Embedder{
+				ID:        "test-id",
+				Name:      "Test Embedder",
+				Subtype:   EmbedderSubtypeVoyageAI,
+				ModelName: EmbedderModelVoyageAI3,
+			},
+			wantErr: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			raw, err := json.Marshal(tc.embedder)
+			if gotErr := err != nil; gotErr != tc.wantErr {
+				t.Fatalf("Embedder.MarshalJSON() error = %v, wantErr %v", err, tc.wantErr)
+			}
+
+			if tc.wantErr {
+				// Nothing to inspect when marshalling was expected to fail.
+				return
+			}
+
+			// Decode into a generic map so the emitted JSON itself is inspected.
+			var decoded map[string]interface{}
+			if err := json.Unmarshal(raw, &decoded); err != nil {
+				t.Fatalf("Failed to unmarshal result: %v", err)
+			}
+
+			// Top-level discriminators.
+			if got := decoded["type"]; got != "embed" {
+				t.Errorf("Expected type 'embed', got %v", got)
+			}
+			if got := decoded["subtype"]; got != string(tc.embedder.Subtype) {
+				t.Errorf("Expected subtype %s, got %v", tc.embedder.Subtype, got)
+			}
+
+			// The model name is expected inside the nested settings object.
+			settings, isObject := decoded["settings"].(map[string]interface{})
+			if !isObject {
+				t.Fatalf("Settings not found or not an object")
+			}
+			if got := settings["model_name"]; got != string(tc.embedder.ModelName) {
+				t.Errorf("Expected model_name %s, got %v", tc.embedder.ModelName, got)
+			}
+		})
+	}
+}
+
+// TestEmbedder_ValidateModel checks one accepted and one rejected model per subtype, plus an unknown subtype.
+func TestEmbedder_ValidateModel(t *testing.T) {
+	cases := []struct {
+		name    string
+		input   *Embedder
+		wantErr bool
+	}{
+		{
+			name: "Valid Azure OpenAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeAzureOpenAI,
+				ModelName: EmbedderModelAzureOpenAITextEmbedding3Small,
+			},
+			wantErr: false,
+		},
+		{
+			name: "Invalid Azure OpenAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeAzureOpenAI,
+				ModelName: "invalid-model",
+			},
+			wantErr: true,
+		},
+		{
+			name: "Valid Bedrock model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeBedrock,
+				ModelName: EmbedderModelBedrockTitanEmbedTextV2,
+			},
+			wantErr: false,
+		},
+		{
+			name: "Invalid Bedrock model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeBedrock,
+				ModelName: "invalid-model",
+			},
+			wantErr: true,
+		},
+		{
+			name: "Valid TogetherAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeTogetherAI,
+				ModelName: EmbedderModelTogetherAIM2Bert80M32kRetrieval,
+			},
+			wantErr: false,
+		},
+		{
+			name: "Invalid TogetherAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeTogetherAI,
+				ModelName: "invalid-model",
+			},
+			wantErr: true,
+		},
+		{
+			name: "Valid VoyageAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeVoyageAI,
+				ModelName: EmbedderModelVoyageAI3,
+			},
+			wantErr: false,
+		},
+		{
+			name: "Invalid VoyageAI model",
+			input: &Embedder{
+				Subtype:   EmbedderSubtypeVoyageAI,
+				ModelName: "invalid-model",
+			},
+			wantErr: true,
+		},
+		{
+			name: "Unknown subtype",
+			input: &Embedder{
+				Subtype:   "unknown",
+				ModelName: EmbedderModelVoyageAI3,
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := tc.input.ValidateModel(); (err != nil) != tc.wantErr {
+				t.Errorf("Embedder.ValidateModel() error = %v, wantErr %v", err, tc.wantErr)
+			}
+		})
+	}
+}
diff --git a/enricher.go b/enricher.go
index 6ba2806..6193b7c 100644
--- a/enricher.go
+++ b/enricher.go
@@ -3,6 +3,7 @@ package unstructured
 import (
 	"encoding/json"
 	"fmt"
+	"strings"
 )
 
 // Enricher is a node that enriches text.
@@ -33,7 +34,10 @@ const (
 
 var _ WorkflowNode = new(Enricher)
 
-func (e Enricher) isNode() {}
+func (e Enricher) isNode()       {}
+func (e Enricher) isImage() bool { return strings.Contains(string(e.Subtype), "image") } // true when the subtype string contains "image"
+func (e Enricher) isTable() bool { return strings.Contains(string(e.Subtype), "table") } // true when the subtype string contains "table"
+func (e Enricher) isNER() bool   { return strings.Contains(string(e.Subtype), "ner") }   // true when the subtype string contains "ner"; NOTE(review): bare substring match — confirm no other subtype embeds "ner"
 
 // MarshalJSON implements the json.Marshaler interface.
 func (e Enricher) MarshalJSON() ([]byte, error) {
diff --git a/shared_config.go b/shared_config.go
index e676e52..0549118 100644
--- a/shared_config.go
+++ b/shared_config.go
@@ -37,17 +37,11 @@ const (
 	ConnectorTypeIBMWatsonxS3               = "ibm_watsonx_s3"
 )
 
-// Shared connector config types that work for both source and destination
-
-type sharedconfiginput struct{}
-
-func (s sharedconfiginput) isSourceConfigInput()      {}
-func (s sharedconfiginput) isDestinationConfigInput() {}
-
 // DatabricksVolumesConnectorConfigInput represents the configuration for a Databricks Volumes connector.
 // It contains host details, catalog information, and authentication credentials.
 type DatabricksVolumesConnectorConfigInput struct {
-	sharedconfiginput
+	sourceconfig
+	destinationconfig
 
 	Host         string  `json:"host"`
 	Catalog      string  `json:"catalog"`
@@ -64,7 +58,8 @@ func (c DatabricksVolumesConnectorConfigInput) Type() string { return ConnectorT
 // ElasticsearchConnectorConfigInput represents the configuration for an Elasticsearch connector.
 // It contains host details, index information, and API key authentication.
 type ElasticsearchConnectorConfigInput struct {
-	sharedconfiginput
+	sourceconfig
+	destinationconfig
 
 	Hosts     []string `json:"hosts"`
 	IndexName string   `json:"index_name"`
@@ -77,7 +72,8 @@ func (c ElasticsearchConnectorConfigInput) Type() string { return ConnectorTypeE
 // MongoDBConnectorConfigInput represents the configuration for a MongoDB connector.
 // It contains database connection details and collection information.
 type MongoDBConnectorConfigInput struct {
-	sharedconfiginput
+	sourceconfig
+	destinationconfig
 
 	Database   string `json:"database"`
 	Collection string `json:"collection"`
@@ -87,16 +83,11 @@ type MongoDBConnectorConfigInput struct {
 // Type always returns the connector type identifier for MongoDB: "mongodb".
 func (c MongoDBConnectorConfigInput) Type() string { return ConnectorTypeMongoDB }
 
-// Shared connector config types that work for both source and destination
-type sharedconfig struct{}
-
-func (s sharedconfig) isSourceConfig()      {}
-func (s sharedconfig) isDestinationConfig() {}
-
 // DatabricksVolumesConnectorConfig represents the configuration for a Databricks Volumes connector.
 // It contains host details, catalog information, and authentication credentials.
 type DatabricksVolumesConnectorConfig struct {
-	sharedconfig
+	sourceconfig
+	destinationconfig
 
 	Host         string  `json:"host"`
 	Catalog      string  `json:"catalog"`
@@ -107,22 +98,193 @@ type DatabricksVolumesConnectorConfig struct {
 	ClientID     string  `json:"client_id"`
 }
 
+var _ SourceConfig = (*DatabricksVolumesConnectorConfig)(nil)
+var _ DestinationConfig = (*DatabricksVolumesConnectorConfig)(nil)
+// Type always returns the connector type identifier for Databricks Volumes: "databricks_volumes".
+func (c DatabricksVolumesConnectorConfig) Type() string { return ConnectorTypeDatabricksVolumes }
+
 // ElasticsearchConnectorConfig represents the configuration for an Elasticsearch connector.
 // It contains host details, index information, and API key authentication.
 type ElasticsearchConnectorConfig struct {
-	sharedconfig
+	sourceconfig
+	destinationconfig
 
 	Hosts     []string `json:"hosts"`
 	IndexName string   `json:"index_name"`
 	ESAPIKey  string   `json:"es_api_key"`
 }
 
+var _ SourceConfig = (*ElasticsearchConnectorConfig)(nil)
+var _ DestinationConfig = (*ElasticsearchConnectorConfig)(nil)
+// Type always returns the connector type identifier for Elasticsearch: "elasticsearch".
+func (c ElasticsearchConnectorConfig) Type() string { return ConnectorTypeElasticsearch }
+
 // MongoDBConnectorConfig represents the configuration for a MongoDB connector.
 // It contains database connection details and collection information.
 type MongoDBConnectorConfig struct {
-	sharedconfig
+	sourceconfig
+	destinationconfig
 
 	Database   string `json:"database"`
 	Collection string `json:"collection"`
 	URI        string `json:"uri"`
 }
+
+var _ SourceConfig = (*MongoDBConnectorConfig)(nil)
+var _ DestinationConfig = (*MongoDBConnectorConfig)(nil)
+// Type always returns the connector type identifier for MongoDB: "mongodb".
+func (c MongoDBConnectorConfig) Type() string { return ConnectorTypeMongoDB }
+
+// CouchbaseConnectorConfig represents the configuration for a Couchbase connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains connection details, bucket information, and authentication credentials. Per the JSON tags, Scope, Collection and CollectionID are omitted when nil; BatchSize has no omitempty, so a zero value is still serialized.
+type CouchbaseConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	Bucket           string  `json:"bucket"`
+	ConnectionString string  `json:"connection_string"`
+	Scope            *string `json:"scope,omitempty"`
+	Collection       *string `json:"collection,omitempty"`
+	BatchSize        int     `json:"batch_size"`
+	Username         string  `json:"username"`
+	Password         string  `json:"password"`
+	CollectionID     *string `json:"collection_id,omitempty"`
+}
+
+var _ SourceConfig = (*CouchbaseConnectorConfig)(nil)
+var _ DestinationConfig = (*CouchbaseConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Couchbase: "couchbase".
+func (c CouchbaseConnectorConfig) Type() string { return ConnectorTypeCouchbase }
+
+// S3ConnectorConfig represents the configuration for an S3 connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It supports both AWS S3 and S3-compatible storage services. Per the JSON tags, RemoteURL is always serialized; every other field is a pointer that is omitted when nil.
+type S3ConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	RemoteURL   string  `json:"remote_url"`
+	Anonymous   *bool   `json:"anonymous,omitempty"`
+	Key         *string `json:"key,omitempty"`
+	Secret      *string `json:"secret,omitempty"`
+	Token       *string `json:"token,omitempty"`
+	EndpointURL *string `json:"endpoint_url,omitempty"`
+	Recursive   *bool   `json:"recursive,omitempty"`
+}
+
+var _ SourceConfig = (*S3ConnectorConfig)(nil)
+var _ DestinationConfig = (*S3ConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for S3: "s3".
+func (c S3ConnectorConfig) Type() string { return ConnectorTypeS3 }
+
+// GCSConnectorConfig represents the configuration for a Google Cloud Storage connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains the remote URL and service account key for authentication. Per the JSON tags, Recursive is omitted when nil; the other fields are always serialized.
+type GCSConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	RemoteURL         string `json:"remote_url"`
+	ServiceAccountKey string `json:"service_account_key"`
+	Recursive         *bool  `json:"recursive,omitempty"`
+}
+
+var _ SourceConfig = (*GCSConnectorConfig)(nil)
+var _ DestinationConfig = (*GCSConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for GCS: "gcs".
+func (c GCSConnectorConfig) Type() string { return ConnectorTypeGCS }
+
+// KafkaCloudConnectorConfig represents the configuration for a Kafka Cloud connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains broker details, topic information, and authentication credentials. Per the JSON tags, Port, GroupID, NumMessagesToConsume and BatchSize are omitted when nil; the other fields are always serialized.
+type KafkaCloudConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	BootstrapServers     string  `json:"bootstrap_servers"`
+	Port                 *int    `json:"port,omitempty"`
+	GroupID              *string `json:"group_id,omitempty"`
+	Topic                string  `json:"topic"`
+	KafkaAPIKey          string  `json:"kafka_api_key"`
+	Secret               string  `json:"secret"`
+	NumMessagesToConsume *int    `json:"num_messages_to_consume,omitempty"`
+	BatchSize            *int    `json:"batch_size,omitempty"`
+}
+
+var _ SourceConfig = (*KafkaCloudConnectorConfig)(nil)
+var _ DestinationConfig = (*KafkaCloudConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Kafka Cloud: "kafka-cloud".
+func (c KafkaCloudConnectorConfig) Type() string { return ConnectorTypeKafkaCloud }
+
+// PostgresConnectorConfig represents the configuration for a PostgreSQL connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains database connection details and table configuration. Per the JSON tags, IDColumn (nil) and Fields (empty) are omitted; Port and BatchSize have no omitempty, so zero values are still serialized.
+type PostgresConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	Host      string   `json:"host"`
+	Database  string   `json:"database"`
+	Port      int      `json:"port"`
+	Username  string   `json:"username"`
+	Password  string   `json:"password"`
+	TableName string   `json:"table_name"`
+	BatchSize int      `json:"batch_size"`
+	IDColumn  *string  `json:"id_column,omitempty"`
+	Fields    []string `json:"fields,omitempty"`
+}
+
+var _ SourceConfig = (*PostgresConnectorConfig)(nil)
+var _ DestinationConfig = (*PostgresConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for PostgreSQL: "postgres".
+func (c PostgresConnectorConfig) Type() string { return ConnectorTypePostgres }
+
+// SnowflakeConnectorConfig represents the configuration for a Snowflake connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains account details, authentication, and table configuration. Per the JSON tags, Port, Schema, TableName, BatchSize, IDColumn, Fields and RecordIDKey are omitted when nil or empty; the other fields are always serialized.
+type SnowflakeConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	Account     string   `json:"account"`
+	Role        string   `json:"role"`
+	User        string   `json:"user"`
+	Password    string   `json:"password"`
+	Host        string   `json:"host"`
+	Port        *int     `json:"port,omitempty"`
+	Database    string   `json:"database"`
+	Schema      *string  `json:"schema,omitempty"`
+	TableName   *string  `json:"table_name,omitempty"`
+	BatchSize   *int     `json:"batch_size,omitempty"`
+	IDColumn    *string  `json:"id_column,omitempty"`
+	Fields      []string `json:"fields,omitempty"`
+	RecordIDKey *string  `json:"record_id_key,omitempty"`
+}
+
+var _ SourceConfig = (*SnowflakeConnectorConfig)(nil)
+var _ DestinationConfig = (*SnowflakeConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Snowflake: "snowflake".
+func (c SnowflakeConnectorConfig) Type() string { return ConnectorTypeSnowflake }
+
+// OneDriveConnectorConfig represents the configuration for a OneDrive connector; it is asserted below as both a SourceConfig and a DestinationConfig.
+// It contains Microsoft Graph API authentication and file access settings. Per the JSON tags, Recursive, Path and RemoteURL are omitted when nil; the other fields are always serialized.
+type OneDriveConnectorConfig struct {
+	sourceconfig
+	destinationconfig
+
+	ClientID     string  `json:"client_id"`
+	UserPName    string  `json:"user_pname"`
+	Tenant       string  `json:"tenant"`
+	AuthorityURL string  `json:"authority_url"`
+	ClientCred   string  `json:"client_cred"`
+	Recursive    *bool   `json:"recursive,omitempty"`
+	Path         *string `json:"path,omitempty"`
+	RemoteURL    *string `json:"remote_url,omitempty"`
+}
+
+var _ SourceConfig = (*OneDriveConnectorConfig)(nil)
+var _ DestinationConfig = (*OneDriveConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for OneDrive: "onedrive".
+func (c OneDriveConnectorConfig) Type() string { return ConnectorTypeOneDrive }
diff --git a/source.go b/source.go
index d3ff3e0..9af5b0c 100644
--- a/source.go
+++ b/source.go
@@ -12,23 +12,23 @@ var sourceConfigFactories = map[string]func() SourceConfig{
 	ConnectorTypeAzure:             func() SourceConfig { return new(AzureSourceConnectorConfig) },
 	ConnectorTypeBox:               func() SourceConfig { return new(BoxSourceConnectorConfig) },
 	ConnectorTypeConfluence:        func() SourceConfig { return new(ConfluenceSourceConnectorConfig) },
-	ConnectorTypeCouchbase:         func() SourceConfig { return new(CouchbaseSourceConnectorConfig) },
+	ConnectorTypeCouchbase:         func() SourceConfig { return new(CouchbaseConnectorConfig) },
 	ConnectorTypeDatabricksVolumes: func() SourceConfig { return new(DatabricksVolumesConnectorConfig) },
 	ConnectorTypeDropbox:           func() SourceConfig { return new(DropboxSourceConnectorConfig) },
 	ConnectorTypeElasticsearch:     func() SourceConfig { return new(ElasticsearchConnectorConfig) },
-	ConnectorTypeGCS:               func() SourceConfig { return new(GCSSourceConnectorConfig) },
+	ConnectorTypeGCS:               func() SourceConfig { return new(GCSConnectorConfig) },
 	ConnectorTypeGoogleDrive:       func() SourceConfig { return new(GoogleDriveSourceConnectorConfig) },
 	ConnectorTypeJira:              func() SourceConfig { return new(JiraSourceConnectorConfig) },
-	ConnectorTypeKafkaCloud:        func() SourceConfig { return new(KafkaCloudSourceConnectorConfig) },
+	ConnectorTypeKafkaCloud:        func() SourceConfig { return new(KafkaCloudConnectorConfig) },
 	ConnectorTypeMongoDB:           func() SourceConfig { return new(MongoDBConnectorConfig) },
-	ConnectorTypeOneDrive:          func() SourceConfig { return new(OneDriveSourceConnectorConfig) },
+	ConnectorTypeOneDrive:          func() SourceConfig { return new(OneDriveConnectorConfig) },
 	ConnectorTypeOutlook:           func() SourceConfig { return new(OutlookSourceConnectorConfig) },
-	ConnectorTypePostgres:          func() SourceConfig { return new(PostgresSourceConnectorConfig) },
-	ConnectorTypeS3:                func() SourceConfig { return new(S3SourceConnectorConfig) },
+	ConnectorTypePostgres:          func() SourceConfig { return new(PostgresConnectorConfig) },
+	ConnectorTypeS3:                func() SourceConfig { return new(S3ConnectorConfig) },
 	ConnectorTypeSalesforce:        func() SourceConfig { return new(SalesforceSourceConnectorConfig) },
 	ConnectorTypeSharePoint:        func() SourceConfig { return new(SharePointSourceConnectorConfig) },
 	ConnectorTypeSlack:             func() SourceConfig { return new(SlackSourceConnectorConfig) },
-	ConnectorTypeSnowflake:         func() SourceConfig { return new(SnowflakeSourceConnectorConfig) },
+	ConnectorTypeSnowflake:         func() SourceConfig { return new(SnowflakeConnectorConfig) },
 	ConnectorTypeZendesk:           func() SourceConfig { return new(ZendeskSourceConnectorConfig) },
 }
 
@@ -87,6 +87,7 @@ func (s *Source) UnmarshalJSON(data []byte) error {
 // It provides a way to identify and work with different source connector types.
 type SourceConfig interface {
 	isSourceConfig()
+	Type() string
 }
 
 type sourceconfig struct{}
@@ -103,18 +104,29 @@ type AzureSourceConnectorConfig struct {
 	AccountKey       *string `json:"account_key,omitempty"`
 	ConnectionString *string `json:"connection_string,omitempty"`
 	SASToken         *string `json:"sas_token,omitempty"`
-	Recursive        bool    `json:"recursive"`
+	Recursive        *bool   `json:"recursive,omitempty"`
 }
 
+var _ SourceConfig = (*AzureSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Azure: "azure".
+func (c AzureSourceConnectorConfig) Type() string { return ConnectorTypeAzure }
+
 // BoxSourceConnectorConfig represents the configuration for a Box source connector.
 // It contains Box app configuration and file access settings.
 type BoxSourceConnectorConfig struct {
 	sourceconfig
 
 	BoxAppConfig string `json:"box_app_config"`
-	Recursive    bool   `json:"recursive"`
+	RemoteURL    string `json:"remote_url"`
+	Recursive    *bool  `json:"recursive,omitempty"`
 }
 
+var _ SourceConfig = (*BoxSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Box: "box".
+func (c BoxSourceConnectorConfig) Type() string { return ConnectorTypeBox }
+
 // ConfluenceSourceConnectorConfig represents the configuration for a Confluence source connector.
 // It contains authentication details and content extraction settings.
 type ConfluenceSourceConnectorConfig struct {
@@ -125,28 +137,18 @@ type ConfluenceSourceConnectorConfig struct {
 	Password                  *string  `json:"password,omitempty"`
 	APIToken                  *string  `json:"api_token,omitempty"`
 	Token                     *string  `json:"token,omitempty"`
-	Cloud                     bool     `json:"cloud"`
+	Cloud                     *bool    `json:"cloud,omitempty"`
 	ExtractImages             *bool    `json:"extract_images,omitempty"`
 	ExtractFiles              *bool    `json:"extract_files,omitempty"`
-	MaxNumOfSpaces            int      `json:"max_num_of_spaces"`
-	MaxNumOfDocsFromEachSpace int      `json:"max_num_of_docs_from_each_space"`
-	Spaces                    []string `json:"spaces"`
+	MaxNumOfSpaces            *int     `json:"max_num_of_spaces,omitempty"`
+	MaxNumOfDocsFromEachSpace *int     `json:"max_num_of_docs_from_each_space,omitempty"`
+	Spaces                    []string `json:"spaces,omitempty"`
 }
 
-// CouchbaseSourceConnectorConfig represents the configuration for a Couchbase source connector.
-// It contains connection details, bucket information, and authentication credentials.
-type CouchbaseSourceConnectorConfig struct {
-	sourceconfig
+var _ SourceConfig = (*ConfluenceSourceConnectorConfig)(nil)
 
-	Bucket           string  `json:"bucket"`
-	ConnectionString string  `json:"connection_string"`
-	Scope            *string `json:"scope,omitempty"`
-	Collection       *string `json:"collection,omitempty"`
-	BatchSize        int     `json:"batch_size"`
-	Username         string  `json:"username"`
-	Password         string  `json:"password"`
-	CollectionID     string  `json:"collection_id"`
-}
+// Type always returns the connector type identifier for Confluence: "confluence".
+func (c ConfluenceSourceConnectorConfig) Type() string { return ConnectorTypeConfluence }
 
 // JiraSourceConnectorConfig represents the configuration for a Jira source connector.
 // It contains authentication details and project/issue filtering settings.
@@ -165,35 +167,10 @@ type JiraSourceConnectorConfig struct {
 	DownloadAttachments *bool    `json:"download_attachments,omitempty"`
 }
 
-// PostgresSourceConnectorConfig represents the configuration for a PostgreSQL source connector.
-// It contains database connection details and table configuration.
-type PostgresSourceConnectorConfig struct {
-	sourceconfig
+var _ SourceConfig = (*JiraSourceConnectorConfig)(nil)
 
-	Host      string   `json:"host"`
-	Database  string   `json:"database"`
-	Port      int      `json:"port"`
-	Username  string   `json:"username"`
-	Password  string   `json:"password"`
-	TableName string   `json:"table_name"`
-	BatchSize int      `json:"batch_size"`
-	IDColumn  string   `json:"id_column"`
-	Fields    []string `json:"fields"`
-}
-
-// S3SourceConnectorConfig represents the configuration for an Amazon S3 source connector.
-// It supports both AWS S3 and S3-compatible storage services for ingesting data.
-type S3SourceConnectorConfig struct {
-	sourceconfig
-
-	RemoteURL   string  `json:"remote_url"`
-	Anonymous   bool    `json:"anonymous"`
-	Key         *string `json:"key,omitempty"`
-	Secret      *string `json:"secret,omitempty"`
-	Token       *string `json:"token,omitempty"`
-	EndpointURL *string `json:"endpoint_url,omitempty"`
-	Recursive   bool    `json:"recursive"`
-}
+// Type always returns the connector type identifier for Jira: "jira".
+func (c JiraSourceConnectorConfig) Type() string { return ConnectorTypeJira }
 
 // SharePointSourceConnectorConfig represents the configuration for a SharePoint source connector.
 // It contains Microsoft Graph API authentication and site access details.
@@ -206,28 +183,14 @@ type SharePointSourceConnectorConfig struct {
 	UserPName    string  `json:"user_pname"`
 	ClientID     string  `json:"client_id"`
 	ClientCred   string  `json:"client_cred"`
-	Recursive    bool    `json:"recursive"`
+	Recursive    *bool   `json:"recursive,omitempty"`
 	Path         *string `json:"path,omitempty"`
 }
 
-// SnowflakeSourceConnectorConfig represents the configuration for a Snowflake source connector.
-// It contains account details, authentication, and table configuration.
-type SnowflakeSourceConnectorConfig struct {
-	sourceconfig
+var _ SourceConfig = (*SharePointSourceConnectorConfig)(nil)
 
-	Account   string   `json:"account"`
-	Role      string   `json:"role"`
-	User      string   `json:"user"`
-	Password  string   `json:"password"`
-	Host      string   `json:"host"`
-	Port      *int     `json:"port,omitempty"`
-	Database  string   `json:"database"`
-	Schema    *string  `json:"schema,omitempty"`
-	TableName *string  `json:"table_name,omitempty"`
-	BatchSize *int     `json:"batch_size,omitempty"`
-	IDColumn  *string  `json:"id_column,omitempty"`
-	Fields    []string `json:"fields,omitempty"`
-}
+// Type always returns the connector type identifier for SharePoint: "sharepoint".
+func (c SharePointSourceConnectorConfig) Type() string { return ConnectorTypeSharePoint }
 
 // DropboxSourceConnectorConfig represents the configuration for a Dropbox source connector.
 // It contains access token and file path configuration.
@@ -236,18 +199,13 @@ type DropboxSourceConnectorConfig struct {
 
 	Token     string `json:"token"`
 	RemoteURL string `json:"remote_url"`
-	Recursive bool   `json:"recursive"`
+	Recursive *bool  `json:"recursive,omitempty"`
 }
 
-// GCSSourceConnectorConfig represents the configuration for a Google Cloud Storage source connector.
-// It contains the remote URL and service account key for authentication.
-type GCSSourceConnectorConfig struct {
-	sourceconfig
+var _ SourceConfig = (*DropboxSourceConnectorConfig)(nil)
 
-	RemoteURL         string `json:"remote_url"`
-	ServiceAccountKey string `json:"service_account_key"`
-	Recursive         bool   `json:"recursive"`
-}
+// Type always returns the connector type identifier for Dropbox: "dropbox".
+func (c DropboxSourceConnectorConfig) Type() string { return ConnectorTypeDropbox }
 
 // GoogleDriveSourceConnectorConfig represents the configuration for a Google Drive source connector.
 // It contains drive ID, service account key, and file filtering settings.
@@ -255,38 +213,15 @@ type GoogleDriveSourceConnectorConfig struct {
 	sourceconfig
 
 	DriveID           string   `json:"drive_id"`
-	ServiceAccountKey string   `json:"service_account_key"`
+	ServiceAccountKey *string  `json:"service_account_key,omitempty"`
 	Extensions        []string `json:"extensions,omitempty"`
-	Recursive         bool     `json:"recursive"`
-}
-
-// KafkaCloudSourceConnectorConfig represents the configuration for a Kafka Cloud source connector.
-// It contains broker details, topic information, and authentication credentials.
-type KafkaCloudSourceConnectorConfig struct {
-	sourceconfig
-
-	BootstrapServers     string  `json:"bootstrap_servers"`
-	Port                 int     `json:"port"`
-	GroupID              *string `json:"group_id,omitempty"`
-	Topic                string  `json:"topic"`
-	KafkaAPIKey          string  `json:"kafka_api_key"`
-	Secret               string  `json:"secret"`
-	NumMessagesToConsume int     `json:"num_messages_to_consume"`
+	Recursive         *bool    `json:"recursive,omitempty"`
 }
 
-// OneDriveSourceConnectorConfig represents the configuration for a OneDrive source connector.
-// It contains Microsoft Graph API authentication and file access settings.
-type OneDriveSourceConnectorConfig struct {
-	sourceconfig
+var _ SourceConfig = (*GoogleDriveSourceConnectorConfig)(nil)
 
-	ClientID     string `json:"client_id"`
-	UserPName    string `json:"user_pname"`
-	Tenant       string `json:"tenant"`
-	AuthorityURL string `json:"authority_url"`
-	ClientCred   string `json:"client_cred"`
-	Recursive    bool   `json:"recursive"`
-	Path         string `json:"path"`
-}
+// Type always returns the connector type identifier for Google Drive: "google_drive".
+func (c GoogleDriveSourceConnectorConfig) Type() string { return ConnectorTypeGoogleDrive }
 
 // OutlookSourceConnectorConfig represents the configuration for an Outlook source connector.
 // It contains Microsoft Graph API authentication and email folder settings.
@@ -298,10 +233,15 @@ type OutlookSourceConnectorConfig struct {
 	ClientID       string   `json:"client_id"`
 	ClientCred     string   `json:"client_cred"`
 	OutlookFolders []string `json:"outlook_folders,omitempty"`
-	Recursive      bool     `json:"recursive"`
+	Recursive      *bool    `json:"recursive,omitempty"`
 	UserEmail      string   `json:"user_email"`
 }
 
+var _ SourceConfig = (*OutlookSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Outlook: "outlook".
+func (c OutlookSourceConnectorConfig) Type() string { return ConnectorTypeOutlook }
+
 // SalesforceSourceConnectorConfig represents the configuration for a Salesforce source connector.
 // It contains authentication details and data category filtering.
 type SalesforceSourceConnectorConfig struct {
@@ -313,6 +253,11 @@ type SalesforceSourceConnectorConfig struct {
 	Categories  []string `json:"categories"`
 }
 
+var _ SourceConfig = (*SalesforceSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Salesforce: "salesforce".
+func (c SalesforceSourceConnectorConfig) Type() string { return ConnectorTypeSalesforce }
+
 // SlackSourceConnectorConfig represents the configuration for a Slack source connector.
 // It contains channel selection, date range filtering, and authentication token.
 type SlackSourceConnectorConfig struct {
@@ -324,6 +269,11 @@ type SlackSourceConnectorConfig struct {
 	Token     string   `json:"token"`
 }
 
+var _ SourceConfig = (*SlackSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Slack: "slack".
+func (c SlackSourceConnectorConfig) Type() string { return ConnectorTypeSlack }
+
 // ZendeskSourceConnectorConfig represents the configuration for a Zendesk source connector.
 // It contains subdomain, authentication, and item type filtering.
 type ZendeskSourceConnectorConfig struct {
@@ -335,3 +285,8 @@ type ZendeskSourceConnectorConfig struct {
 	ItemType  *string `json:"item_type,omitempty"`
 	BatchSize *int    `json:"batch_size,omitempty"`
 }
+
+var _ SourceConfig = (*ZendeskSourceConnectorConfig)(nil)
+
+// Type always returns the connector type identifier for Zendesk: "zendesk".
+func (c ZendeskSourceConnectorConfig) Type() string { return ConnectorTypeZendesk }
diff --git a/source_create.go b/source_create.go
index 364b960..8d9049c 100644
--- a/source_create.go
+++ b/source_create.go
@@ -12,14 +12,7 @@ import (
 // It contains the name and configuration for the source.
 type CreateSourceRequest struct {
 	Name   string
-	Config SourceConfigInput
-}
-
-// SourceConfigInput is an interface that all source connector configurations must implement.
-// It provides a way to identify the type of source connector and marshal its configuration.
-type SourceConfigInput interface {
-	isSourceConfigInput()
-	Type() string
+	Config SourceConfig
 }
 
 // CreateSource creates a new source connector with the specified configuration.
@@ -63,305 +56,3 @@ func (c *Client) CreateSource(ctx context.Context, in CreateSourceRequest) (*Sou
 
 	return &source, nil
 }
-
-type sourceconfiginput struct{}
-
-func (s sourceconfiginput) isSourceConfigInput() {}
-
-// AzureSourceConnectorConfigInput represents the configuration for an Azure Blob Storage source connector.
-// It supports authentication via connection string, account key, or SAS token.
-type AzureSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	RemoteURL        string  `json:"remote_url"`
-	AccountName      *string `json:"account_name,omitempty"`
-	AccountKey       *string `json:"account_key,omitempty"`
-	ConnectionString *string `json:"connection_string,omitempty"`
-	SASToken         *string `json:"sas_token,omitempty"`
-	Recursive        *bool   `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for Azure Blob Storage: "azure".
-func (c AzureSourceConnectorConfigInput) Type() string { return ConnectorTypeAzure }
-
-// BoxSourceConnectorConfigInput represents the configuration for a Box source connector.
-// It contains Box app configuration and file access settings.
-type BoxSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	BoxAppConfig string `json:"box_app_config"`
-	RemoteURL    string `json:"remote_url"`
-	Recursive    *bool  `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for Box: "box".
-func (c BoxSourceConnectorConfigInput) Type() string { return ConnectorTypeBox }
-
-// ConfluenceSourceConnectorConfigInput represents the configuration for a Confluence source connector.
-// It contains authentication details and content extraction settings.
-type ConfluenceSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	URL                       string   `json:"url"`
-	Username                  string   `json:"username"`
-	Password                  *string  `json:"password,omitempty"`
-	APIToken                  *string  `json:"api_token,omitempty"`
-	Token                     *string  `json:"token,omitempty"`
-	Cloud                     *bool    `json:"cloud,omitempty"`
-	ExtractImages             *bool    `json:"extract_images,omitempty"`
-	ExtractFiles              *bool    `json:"extract_files,omitempty"`
-	MaxNumOfSpaces            *int     `json:"max_num_of_spaces,omitempty"`
-	MaxNumOfDocsFromEachSpace *int     `json:"max_num_of_docs_from_each_space,omitempty"`
-	Spaces                    []string `json:"spaces,omitempty"`
-}
-
-// Type always returns the connector type identifier for Confluence: "confluence".
-func (c ConfluenceSourceConnectorConfigInput) Type() string { return ConnectorTypeConfluence }
-
-// CouchbaseSourceConnectorConfigInput represents the configuration for a Couchbase source connector.
-// It contains connection details, bucket information, and authentication credentials.
-type CouchbaseSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Bucket           string  `json:"bucket"`
-	ConnectionString string  `json:"connection_string"`
-	Scope            *string `json:"scope,omitempty"`
-	Collection       *string `json:"collection,omitempty"`
-	BatchSize        int     `json:"batch_size"`
-	Username         string  `json:"username"`
-	Password         string  `json:"password"`
-	CollectionID     string  `json:"collection_id"`
-}
-
-// Type always returns the connector type identifier for Couchbase: "couchbase".
-func (c CouchbaseSourceConnectorConfigInput) Type() string { return ConnectorTypeCouchbase }
-
-// DropboxSourceConnectorConfigInput represents the configuration for a Dropbox source connector.
-// It contains access token and file path configuration.
-type DropboxSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Token     string `json:"token"`
-	RemoteURL string `json:"remote_url"`
-	Recursive *bool  `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for Dropbox: "dropbox".
-func (c DropboxSourceConnectorConfigInput) Type() string { return ConnectorTypeDropbox }
-
-// GCSSourceConnectorConfigInput represents the configuration for a Google Cloud Storage source connector.
-// It contains the remote URL and service account key for authentication.
-type GCSSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	RemoteURL         string `json:"remote_url"`
-	ServiceAccountKey string `json:"service_account_key"`
-	Recursive         *bool  `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for Google Cloud Storage: "gcs".
-func (c GCSSourceConnectorConfigInput) Type() string { return ConnectorTypeGCS }
-
-// GoogleDriveSourceConnectorConfigInput represents the configuration for a Google Drive source connector.
-// It contains drive ID, service account key, and file filtering settings.
-type GoogleDriveSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	DriveID           string   `json:"drive_id"`
-	ServiceAccountKey *string  `json:"service_account_key,omitempty"`
-	Extensions        []string `json:"extensions,omitempty"`
-	Recursive         *bool    `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for Google Drive: "google_drive".
-func (c GoogleDriveSourceConnectorConfigInput) Type() string { return ConnectorTypeGoogleDrive }
-
-// KafkaCloudSourceConnectorConfigInput represents the configuration for a Kafka Cloud source connector.
-// It contains broker details, topic information, and authentication credentials.
-type KafkaCloudSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	BootstrapServers     string  `json:"bootstrap_servers"`
-	Port                 *int    `json:"port,omitempty"`
-	GroupID              *string `json:"group_id,omitempty"`
-	Topic                string  `json:"topic"`
-	KafkaAPIKey          string  `json:"kafka_api_key"`
-	Secret               string  `json:"secret"`
-	NumMessagesToConsume *int    `json:"num_messages_to_consume,omitempty"`
-}
-
-// Type always returns the connector type identifier for Kafka Cloud: "kafka-cloud".
-func (c KafkaCloudSourceConnectorConfigInput) Type() string { return ConnectorTypeKafkaCloud }
-
-// OneDriveSourceConnectorConfigInput represents the configuration for a OneDrive source connector.
-// It contains Microsoft Graph API authentication and file access settings.
-type OneDriveSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	ClientID     string `json:"client_id"`
-	UserPName    string `json:"user_pname"`
-	Tenant       string `json:"tenant"`
-	AuthorityURL string `json:"authority_url"`
-	ClientCred   string `json:"client_cred"`
-	Recursive    *bool  `json:"recursive,omitempty"`
-	Path         string `json:"path"`
-}
-
-// Type always returns the connector type identifier for OneDrive: "onedrive".
-func (c OneDriveSourceConnectorConfigInput) Type() string { return ConnectorTypeOneDrive }
-
-// OutlookSourceConnectorConfigInput represents the configuration for an Outlook source connector.
-// It contains Microsoft Graph API authentication and email folder settings.
-type OutlookSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	AuthorityURL   *string  `json:"authority_url,omitempty"`
-	Tenant         *string  `json:"tenant,omitempty"`
-	ClientID       string   `json:"client_id"`
-	ClientCred     string   `json:"client_cred"`
-	OutlookFolders []string `json:"outlook_folders,omitempty"`
-	Recursive      *bool    `json:"recursive,omitempty"`
-	UserEmail      string   `json:"user_email"`
-}
-
-// Type always returns the connector type identifier for Outlook: "outlook".
-func (c OutlookSourceConnectorConfigInput) Type() string { return ConnectorTypeOutlook }
-
-// PostgresSourceConnectorConfigInput represents the configuration for a PostgreSQL source connector.
-// It contains database connection details and table configuration.
-type PostgresSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Host      string   `json:"host"`
-	Database  string   `json:"database"`
-	Port      int      `json:"port"`
-	Username  string   `json:"username"`
-	Password  string   `json:"password"`
-	TableName string   `json:"table_name"`
-	BatchSize int      `json:"batch_size"`
-	IDColumn  *string  `json:"id_column,omitempty"`
-	Fields    []string `json:"fields,omitempty"`
-}
-
-// Type always returns the connector type identifier for PostgreSQL: "postgres".
-func (c PostgresSourceConnectorConfigInput) Type() string { return ConnectorTypePostgres }
-
-// S3SourceConnectorConfigInput represents the configuration for an Amazon S3 source connector.
-// It supports both AWS S3 and S3-compatible storage services.
-type S3SourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	RemoteURL   string  `json:"remote_url"`
-	Anonymous   *bool   `json:"anonymous,omitempty"`
-	Key         *string `json:"key,omitempty"`
-	Secret      *string `json:"secret,omitempty"`
-	Token       *string `json:"token,omitempty"`
-	EndpointURL *string `json:"endpoint_url,omitempty"`
-	Recursive   *bool   `json:"recursive,omitempty"`
-}
-
-// Type always returns the connector type identifier for S3: "s3".
-func (c S3SourceConnectorConfigInput) Type() string { return ConnectorTypeS3 }
-
-// SalesforceSourceConnectorConfigInput represents the configuration for a Salesforce source connector.
-// It contains authentication details and data category filtering.
-type SalesforceSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Username    string   `json:"username"`
-	ConsumerKey string   `json:"consumer_key"`
-	PrivateKey  string   `json:"private_key"`
-	Categories  []string `json:"categories"`
-}
-
-// Type always returns the connector type identifier for Salesforce: "salesforce".
-func (c SalesforceSourceConnectorConfigInput) Type() string { return ConnectorTypeSalesforce }
-
-// SharePointSourceConnectorConfigInput represents the configuration for a SharePoint source connector.
-// It contains Microsoft Graph API authentication and site access details.
-type SharePointSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Site         string  `json:"site"`
-	Tenant       string  `json:"tenant"`
-	AuthorityURL *string `json:"authority_url,omitempty"`
-	UserPName    string  `json:"user_pname"`
-	ClientID     string  `json:"client_id"`
-	ClientCred   string  `json:"client_cred"`
-	Recursive    *bool   `json:"recursive,omitempty"`
-	Path         *string `json:"path,omitempty"`
-}
-
-// Type always returns the connector type identifier for SharePoint: "sharepoint".
-func (c SharePointSourceConnectorConfigInput) Type() string { return ConnectorTypeSharePoint }
-
-// SlackSourceConnectorConfigInput represents the configuration for a Slack source connector.
-// It contains channel selection, date range filtering, and authentication token.
-type SlackSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Channels  []string `json:"channels"`
-	StartDate *string  `json:"start_date,omitempty"`
-	EndDate   *string  `json:"end_date,omitempty"`
-	Token     string   `json:"token"`
-}
-
-// Type always returns the connector type identifier for Slack: "slack".
-func (c SlackSourceConnectorConfigInput) Type() string { return ConnectorTypeSlack }
-
-// SnowflakeSourceConnectorConfigInput represents the configuration for a Snowflake source connector.
-// It contains account details, authentication, and table configuration.
-type SnowflakeSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Account   string   `json:"account"`
-	Role      string   `json:"role"`
-	User      string   `json:"user"`
-	Password  string   `json:"password"`
-	Host      string   `json:"host"`
-	Port      *int     `json:"port,omitempty"`
-	Database  string   `json:"database"`
-	Schema    *string  `json:"schema,omitempty"`
-	TableName *string  `json:"table_name,omitempty"`
-	BatchSize *int     `json:"batch_size,omitempty"`
-	IDColumn  *string  `json:"id_column,omitempty"`
-	Fields    []string `json:"fields,omitempty"`
-}
-
-// Type always returns the connector type identifier for Snowflake: "snowflake".
-func (c SnowflakeSourceConnectorConfigInput) Type() string { return ConnectorTypeSnowflake }
-
-// JiraSourceConnectorConfigInput represents the configuration for a Jira source connector.
-// It contains authentication details and project/issue filtering settings.
-type JiraSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	URL                 string   `json:"url"`
-	Username            string   `json:"username"`
-	Password            *string  `json:"password,omitempty"`
-	Token               *string  `json:"token,omitempty"`
-	Cloud               *bool    `json:"cloud,omitempty"`
-	Projects            []string `json:"projects,omitempty"`
-	Boards              []string `json:"boards,omitempty"`
-	Issues              []string `json:"issues,omitempty"`
-	StatusFilters       []string `json:"status_filters,omitempty"`
-	DownloadAttachments *bool    `json:"download_attachments,omitempty"`
-}
-
-// Type always returns the connector type identifier for Jira: "jira".
-func (c JiraSourceConnectorConfigInput) Type() string { return ConnectorTypeJira }
-
-// ZendeskSourceConnectorConfigInput represents the configuration for a Zendesk source connector.
-// It contains subdomain, authentication, and item type filtering.
-type ZendeskSourceConnectorConfigInput struct {
-	sourceconfiginput
-
-	Subdomain string  `json:"subdomain"`
-	Email     string  `json:"email"`
-	APIToken  string  `json:"api_token"`
-	ItemType  *string `json:"item_type,omitempty"`
-	BatchSize *int    `json:"batch_size,omitempty"`
-}
-
-// Type always returns the connector type identifier for Zendesk: "zendesk".
-func (c ZendeskSourceConnectorConfigInput) Type() string { return ConnectorTypeZendesk }
diff --git a/source_create_test.go b/source_create_test.go
index f2256eb..def6d5d 100644
--- a/source_create_test.go
+++ b/source_create_test.go
@@ -36,13 +36,13 @@ func TestCreateSource(t *testing.T) {
 
 	source, err := client.CreateSource(t.Context(), CreateSourceRequest{
 		Name: "test_source_name",
-		Config: &OneDriveSourceConnectorConfigInput{
+		Config: &OneDriveConnectorConfig{
 			ClientID:     "foo",
 			Tenant:       "foo",
 			AuthorityURL: "foo",
 			UserPName:    "foo",
 			ClientCred:   "foo",
-			Path:         "foo",
+			Path:         String("foo"),
 		},
 	})
 	if err != nil {
@@ -57,7 +57,7 @@ func TestCreateSource(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := source.Config.(*OneDriveSourceConnectorConfig)
+	cfg, ok := source.Config.(*OneDriveConnectorConfig)
 	if !ok {
 		t.Errorf("expected source config to be %T, got %T", cfg, source.Config)
 	}
diff --git a/source_get_test.go b/source_get_test.go
index cc14b82..2b3b058 100644
--- a/source_get_test.go
+++ b/source_get_test.go
@@ -54,7 +54,7 @@ func TestGetSource(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := source.Config.(*OneDriveSourceConnectorConfig)
+	cfg, ok := source.Config.(*OneDriveConnectorConfig)
 	if !ok {
 		t.Errorf("expected source config to be %T, got %T", cfg, source.Config)
 	}
diff --git a/source_list_test.go b/source_list_test.go
index 3733ac3..bbc3171 100644
--- a/source_list_test.go
+++ b/source_list_test.go
@@ -56,7 +56,7 @@ func TestListSources(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := source.Config.(*OneDriveSourceConnectorConfig)
+	cfg, ok := source.Config.(*OneDriveConnectorConfig)
 	if !ok {
 		t.Errorf("expected source config to be %T, got %T", cfg, source.Config)
 	}
diff --git a/source_update.go b/source_update.go
index a7a842a..45d1f5c 100644
--- a/source_update.go
+++ b/source_update.go
@@ -11,7 +11,7 @@ import (
 // UpdateSourceRequest represents the request to update a source connector.
 type UpdateSourceRequest struct {
 	ID     string
-	Config SourceConfigInput
+	Config SourceConfig
 }
 
 // UpdateSource updates the configuration of an existing source connector.
diff --git a/source_update_test.go b/source_update_test.go
index 3bb3bc5..c6ef326 100644
--- a/source_update_test.go
+++ b/source_update_test.go
@@ -45,14 +45,14 @@ func TestUpdateSource(t *testing.T) {
 
 	source, err := client.UpdateSource(t.Context(), UpdateSourceRequest{
 		ID: id,
-		Config: &OneDriveSourceConnectorConfigInput{
+		Config: &OneDriveConnectorConfig{
 			ClientID:     "foo",
 			Tenant:       "foo",
 			AuthorityURL: "foo",
 			UserPName:    "foo",
 			ClientCred:   "foo",
 			Recursive:    Bool(false),
-			Path:         "foo",
+			Path:         String("foo"),
 		},
 	})
 	if err != nil {
@@ -67,7 +67,7 @@ func TestUpdateSource(t *testing.T) {
 		t.Error(err)
 	}
 
-	cfg, ok := source.Config.(*OneDriveSourceConnectorConfig)
+	cfg, ok := source.Config.(*OneDriveConnectorConfig)
 	if !ok {
 		t.Errorf("expected source config to be %T, got %T", cfg, source.Config)
 	}
diff --git a/test/destination_test.go b/test/destination_test.go
index 190cc17..7da437e 100644
--- a/test/destination_test.go
+++ b/test/destination_test.go
@@ -24,20 +24,20 @@ func TestDestinationPermutations(t *testing.T) {
 		t.Fatalf("failed to create client: %v", err)
 	}
 
-	for name, src := range map[string]unstructured.DestinationConfigInput{
-		"astra-db": unstructured.AstraDBConnectorConfigInput{
+	for name, src := range map[string]unstructured.DestinationConfig{
+		"astra-db": &unstructured.AstraDBConnectorConfig{
 			CollectionName: "foo",
 			APIEndpoint:    "https://foo.apps.astra.datastax.com",
 			Token:          "foo",
 		},
 
-		"azure-ai-search": unstructured.AzureAISearchConnectorConfigInput{
+		"azure-ai-search": &unstructured.AzureAISearchConnectorConfig{
 			Endpoint: "https://foo.search.windows.net",
 			Index:    "foo",
 			Key:      "foo",
 		},
 
-		"couchbase": unstructured.CouchbaseDestinationConnectorConfigInput{
+		"couchbase": &unstructured.CouchbaseConnectorConfig{
 			Bucket:           "foo",
 			ConnectionString: "couchbase://foo",
 			Username:         "foo",
@@ -46,7 +46,7 @@ func TestDestinationPermutations(t *testing.T) {
 		},
 
 		// server responds 500
-		// "databricks-volume-delta-table": unstructured.DatabricksVDTDestinationConnectorConfigInput{
+		// "databricks-volume-delta-table": &unstructured.DatabricksVDTDestinationConnectorConfig{
 		// 	ServerHostname: "foo.cloud.databricks.com",
 		// 	HTTPPath:       "/sql/1.0/warehouses/foo",
 		// 	Token:          S("foo"),
@@ -54,39 +54,39 @@ func TestDestinationPermutations(t *testing.T) {
 		// 	Volume:         "foo",
 		// },
 
-		"delta-table": unstructured.DeltaTableConnectorConfigInput{
+		"delta-table": &unstructured.DeltaTableConnectorConfig{
 			AwsAccessKeyID:     "foo",
 			AwsSecretAccessKey: "foo",
 			AwsRegion:          "us-east-1",
 			TableURI:           "s3://foo/table",
 		},
 
-		"elasticsearch": unstructured.ElasticsearchConnectorConfigInput{
+		"elasticsearch": &unstructured.ElasticsearchConnectorConfig{
 			Hosts:     []string{"https://foo.elastic-cloud.com"},
 			IndexName: "foo",
 			ESAPIKey:  "foo",
 		},
 
-		"gcs": unstructured.GCSDestinationConnectorConfigInput{
+		"gcs": &unstructured.GCSConnectorConfig{
 			RemoteURL:         "gs://foo",
 			ServiceAccountKey: "foo",
 		},
 
 		// server responds 412 asking for `bootstrap_server` instead of `bootstrap_servers`
-		// "kafka-cloud": unstructured.KafkaCloudDestinationConnectorConfigInput{
+		// "kafka-cloud": &unstructured.KafkaCloudDestinationConnectorConfig{
 		// 	BootstrapServers: "foo.cloud.confluent.io",
 		// 	Topic:            "foo",
 		// 	KafkaAPIKey:      "foo",
 		// 	Secret:           "foo",
 		// },
 
-		"milvus-token": unstructured.MilvusDestinationConnectorConfigInput{
+		"milvus-token": &unstructured.MilvusDestinationConnectorConfig{
 			URI:            "https://foo.zilliz.com",
 			CollectionName: "foo",
 			RecordIDKey:    "foo",
 			Token:          S("foo"),
 		},
-		"milvus-password": unstructured.MilvusDestinationConnectorConfigInput{
+		"milvus-password": &unstructured.MilvusDestinationConnectorConfig{
 			URI:            "https://foo.zilliz.com",
 			CollectionName: "foo",
 			RecordIDKey:    "foo",
@@ -94,14 +94,14 @@ func TestDestinationPermutations(t *testing.T) {
 			Password:       S("foo"),
 		},
 
-		"mongo-db": unstructured.MongoDBConnectorConfigInput{
+		"mongo-db": &unstructured.MongoDBConnectorConfig{
 			Database:   "foo",
 			Collection: "foo",
 			URI:        "mongodb://foo:27017/foo",
 		},
 
 		// server responds 422: Destination Connector type motherduck not supported
-		// "mother-duck": unstructured.MotherduckDestinationConnectorConfigInput{
+		// "mother-duck": &unstructured.MotherduckDestinationConnectorConfig{
 		// 	Account:  "foo",
 		// 	Role:     "foo",
 		// 	User:     "foo",
@@ -110,29 +110,29 @@ func TestDestinationPermutations(t *testing.T) {
 		// 	Database: "foo",
 		// },
 
-		"neo4j": unstructured.Neo4jDestinationConnectorConfigInput{
+		"neo4j": &unstructured.Neo4jDestinationConnectorConfig{
 			URI:      "bolt://foo:7687",
 			Database: "foo",
 			Username: "foo",
 			Password: "foo",
 		},
 
-		"one-drive": unstructured.OneDriveDestinationConnectorConfigInput{
+		"one-drive": &unstructured.OneDriveConnectorConfig{
 			ClientID:     "foo",
 			UserPName:    "foo",
 			Tenant:       "foo",
 			AuthorityURL: "https://login.microsoftonline.com/foo",
 			ClientCred:   "foo",
-			RemoteURL:    "onedrive://foo",
+			RemoteURL:    S("onedrive://foo"),
 		},
 
-		"pinecone": unstructured.PineconeDestinationConnectorConfigInput{
+		"pinecone": &unstructured.PineconeDestinationConnectorConfig{
 			IndexName: "foo",
 			APIKey:    "foo",
 			Namespace: "foo",
 		},
 
-		"postgres": unstructured.PostgresDestinationConnectorConfigInput{
+		"postgres": &unstructured.PostgresConnectorConfig{
 			Host:      "foo.com",
 			Database:  "foo",
 			Port:      5432,
@@ -142,26 +142,26 @@ func TestDestinationPermutations(t *testing.T) {
 			BatchSize: 100,
 		},
 
-		"redis": unstructured.RedisDestinationConnectorConfigInput{
+		"redis": &unstructured.RedisDestinationConnectorConfig{
 			Host:     "foo.com",
 			Username: S("foo"),
 			Password: S("foo"),
 		},
 
-		"qdrant-cloud": unstructured.QdrantCloudDestinationConnectorConfigInput{
+		"qdrant-cloud": &unstructured.QdrantCloudDestinationConnectorConfig{
 			URL:            "https://foo.qdrant.io",
 			APIKey:         "foo",
 			CollectionName: "foo",
 		},
 
-		"s3": unstructured.S3DestinationConnectorConfigInput{
+		"s3": &unstructured.S3ConnectorConfig{
 			RemoteURL: "s3://foo",
 			Key:       S("foo"),
 			Secret:    S("foo"),
 		},
 
 		// server responds 500
-		// "snowflake": unstructured.SnowflakeDestinationConnectorConfigInput{
+		// "snowflake": &unstructured.SnowflakeDestinationConnectorConfig{
 		// 	Account:  "foo",
 		// 	Role:     "foo",
 		// 	User:     "foo",
@@ -170,12 +170,12 @@ func TestDestinationPermutations(t *testing.T) {
 		// 	Database: "foo",
 		// },
 
-		"weaviate-cloud": unstructured.WeaviateDestinationConnectorConfigInput{
+		"weaviate-cloud": &unstructured.WeaviateDestinationConnectorConfig{
 			ClusterURL: "https://foo.weaviate.network",
 			APIKey:     "foo",
 		},
 
-		"ibm-watsonx-s3": unstructured.IBMWatsonxS3DestinationConnectorConfigInput{
+		"ibm-watsonx-s3": &unstructured.IBMWatsonxS3DestinationConnectorConfig{
 			IAMApiKey:             "foo",
 			AccessKeyID:           "foo",
 			SecretAccessKey:       "foo",
@@ -188,7 +188,7 @@ func TestDestinationPermutations(t *testing.T) {
 		},
 
 		// server responds 500
-		// "databricks-volumes": unstructured.DatabricksVolumesConnectorConfigInput{
+		// "databricks-volumes": &unstructured.DatabricksVolumesConnectorConfig{
 		// 	Host:         "foo.cloud.databricks.com",
 		// 	Catalog:      "foo",
 		// 	Volume:       "foo",
diff --git a/test/main_test.go b/test/main_test.go
index 9b23979..3185337 100644
--- a/test/main_test.go
+++ b/test/main_test.go
@@ -21,6 +21,7 @@ var B = unstructured.Bool
 
 func TestWorkflow(t *testing.T) {
 	t.Parallel()
+	t.Skip()
 
 	if os.Getenv("UNSTRUCTURED_API_KEY") == "" {
 		t.Skip("skipping because UNSTRUCTURED_API_KEY is not set")
@@ -43,8 +44,7 @@ func TestWorkflow(t *testing.T) {
 	ctx := t.Context()
 
 	workflow, err := client.CreateWorkflow(ctx, &unstructured.CreateWorkflowRequest{
-		Name:         "test",
-		WorkflowType: unstructured.WorkflowTypeCustom,
+		Name: "test",
 		WorkflowNodes: []unstructured.WorkflowNode{
 			&unstructured.PartitionerAuto{
 				Name: "Partitioner",
diff --git a/test/source_test.go b/test/source_test.go
index 92d2614..3c1cd52 100644
--- a/test/source_test.go
+++ b/test/source_test.go
@@ -24,45 +24,51 @@ func TestSourcePermutations(t *testing.T) {
 		t.Fatalf("failed to create client: %v", err)
 	}
 
-	for name, src := range map[string]unstructured.SourceConfigInput{
-		"azure-account-key": unstructured.AzureSourceConnectorConfigInput{
+	for name, src := range map[string]unstructured.SourceConfig{
+		"azure-account-key": &unstructured.AzureSourceConnectorConfig{
 			RemoteURL:   "az://foo",
 			AccountName: S("foo"),
 			AccountKey:  S("foo"),
 		},
-		"azure-connection-string": unstructured.AzureSourceConnectorConfigInput{
+		"azure-connection-string": &unstructured.AzureSourceConnectorConfig{
 			RemoteURL:        "az://foo",
 			ConnectionString: S("foo"),
 		},
-		"azure-sas-token": unstructured.AzureSourceConnectorConfigInput{
+		"azure-sas-token": &unstructured.AzureSourceConnectorConfig{
 			RemoteURL:   "az://foo",
 			AccountName: S("foo"),
 			SASToken:    S("foo"),
 		},
 
-		"box": unstructured.BoxSourceConnectorConfigInput{
+		"box": &unstructured.BoxSourceConnectorConfig{
 			BoxAppConfig: "foo",
 			RemoteURL:    "box://foo",
 		},
 
 		// server responds 500
-		// "confluence": unstructured.ConfluenceSourceConnectorConfigInput{
+		// "confluence-password": &unstructured.ConfluenceSourceConnectorConfig{
 		// 	URL:      "https://foo.atlassian.net",
 		// 	Username: "foo",
 		// 	Password: S("foo"),
 		// },
 
-		"couchbase": unstructured.CouchbaseSourceConnectorConfigInput{
+		// "confluence-token": &unstructured.ConfluenceSourceConnectorConfig{
+		// 	URL:      "https://foo.atlassian.net",
+		// 	Username: "foo",
+		// 	Token:    S("foo"),
+		// },
+
+		"couchbase": &unstructured.CouchbaseConnectorConfig{
 			Bucket:           "foo",
 			ConnectionString: "couchbase://foo",
 			Username:         "foo",
 			Password:         "foo",
-			CollectionID:     "foo",
+			CollectionID:     S("foo"),
 			BatchSize:        100,
 		},
 
 		// server responds 500
-		// "databricks-volumes": unstructured.DatabricksVolumesConnectorConfigInput{
+		// "databricks-volumes": &unstructured.DatabricksVolumesConnectorConfig{
 		// 	Host:         "foo.cloud.databricks.com",
 		// 	Catalog:      "foo",
 		// 	Volume:       "foo",
@@ -71,64 +77,64 @@ func TestSourcePermutations(t *testing.T) {
 		// 	ClientID:     "foo",
 		// },
 
-		"dropbox": unstructured.DropboxSourceConnectorConfigInput{
+		"dropbox": &unstructured.DropboxSourceConnectorConfig{
 			Token:     "foo",
 			RemoteURL: "dropbox://foo",
 		},
 
-		"elasticsearch": unstructured.ElasticsearchConnectorConfigInput{
+		"elasticsearch": &unstructured.ElasticsearchConnectorConfig{
 			Hosts:     []string{"https://foo.elastic-cloud.com"},
 			IndexName: "foo",
 			ESAPIKey:  "foo",
 		},
 
-		"gcs": unstructured.GCSSourceConnectorConfigInput{
+		"gcs": &unstructured.GCSConnectorConfig{
 			RemoteURL:         "gs://foo",
 			ServiceAccountKey: "foo",
 		},
 
-		"google-drive": unstructured.GoogleDriveSourceConnectorConfigInput{
+		"google-drive": &unstructured.GoogleDriveSourceConnectorConfig{
 			DriveID:           "foo",
 			ServiceAccountKey: S("foo"),
 		},
 
-		"jira": unstructured.JiraSourceConnectorConfigInput{
+		"jira": &unstructured.JiraSourceConnectorConfig{
 			URL:      "https://foo.atlassian.net",
 			Username: "foo",
 			Password: S("foo"),
 		},
 
 		// server responds 412 asking for `bootstrap_server` instead of `bootstrap_servers`
-		// "kafka-cloud": unstructured.KafkaCloudSourceConnectorConfigInput{
+		// "kafka-cloud": &unstructured.KafkaCloudSourceConnectorConfig{
 		// 	BootstrapServers: "foo.cloud.confluent.io",
 		// 	Topic:            "foo",
 		// 	KafkaAPIKey:      "foo",
 		// 	Secret:           "foo",
 		// },
 
-		"mongodb": unstructured.MongoDBConnectorConfigInput{
+		"mongodb": &unstructured.MongoDBConnectorConfig{
 			Database:   "foo",
 			Collection: "foo",
 			URI:        "mongodb://foo",
 		},
 
-		"onedrive": unstructured.OneDriveSourceConnectorConfigInput{
+		"onedrive": &unstructured.OneDriveConnectorConfig{
 			ClientID:     "foo",
 			UserPName:    "foo",
 			Tenant:       "foo",
 			AuthorityURL: "https://login.microsoftonline.com/foo",
 			ClientCred:   "foo",
-			Path:         "/foo",
+			Path:         S("/foo"),
 		},
 
-		"outlook": unstructured.OutlookSourceConnectorConfigInput{
+		"outlook": &unstructured.OutlookSourceConnectorConfig{
 			ClientID:       "foo",
 			ClientCred:     "foo",
 			UserEmail:      "foo@example.com",
 			OutlookFolders: []string{"Inbox"},
 		},
 
-		"postgres": unstructured.PostgresSourceConnectorConfigInput{
+		"postgres": &unstructured.PostgresConnectorConfig{
 			Host:      "foo.com",
 			Database:  "foo",
 			Port:      5432,
@@ -138,20 +144,20 @@ func TestSourcePermutations(t *testing.T) {
 			BatchSize: 100,
 		},
 
-		"s3": unstructured.S3SourceConnectorConfigInput{
+		"s3": &unstructured.S3ConnectorConfig{
 			RemoteURL: "s3://foo",
 			Key:       S("foo"),
 			Secret:    S("foo"),
 		},
 
-		"salesforce": unstructured.SalesforceSourceConnectorConfigInput{
+		"salesforce": &unstructured.SalesforceSourceConnectorConfig{
 			Username:    "foo",
 			ConsumerKey: "foo",
 			PrivateKey:  "foo",
 			Categories:  []string{"foo"},
 		},
 
-		"sharepoint": unstructured.SharePointSourceConnectorConfigInput{
+		"sharepoint": &unstructured.SharePointSourceConnectorConfig{
 			Site:       "https://foo.sharepoint.com/sites/foo",
 			Tenant:     "foo",
 			UserPName:  "foo",
@@ -160,7 +166,7 @@ func TestSourcePermutations(t *testing.T) {
 		},
 
 		// server responds 500
-		// "snowflake": unstructured.SnowflakeSourceConnectorConfigInput{
+		// "snowflake": &unstructured.SnowflakeSourceConnectorConfig{
 		// 	Account:   "foo",
 		// 	Role:      "foo",
 		// 	User:      "foo",
@@ -171,7 +177,7 @@ func TestSourcePermutations(t *testing.T) {
 		// 	IDColumn:  S("foo"),
 		// },
 
-		"zendesk": unstructured.ZendeskSourceConnectorConfigInput{
+		"zendesk": &unstructured.ZendeskSourceConnectorConfig{
 			Subdomain: "foo",
 			Email:     "foo@example.com",
 			APIToken:  "foo",
diff --git a/workflow_create.go b/workflow_create.go
index a75058b..e2674fb 100644
--- a/workflow_create.go
+++ b/workflow_create.go
@@ -13,7 +13,6 @@ type CreateWorkflowRequest struct {
 	Name          string         `json:"name"`
 	SourceID      *string        `json:"source_id,omitempty"`
 	DestinationID *string        `json:"destination_id,omitempty"`
-	WorkflowType  WorkflowType   `json:"workflow_type"`
 	WorkflowNodes []WorkflowNode `json:"workflow_nodes,omitempty"`
 	Schedule      *string        `json:"schedule,omitempty"`
 	ReprocessAll  *bool          `json:"reprocess_all,omitempty"`
@@ -21,7 +20,13 @@ type CreateWorkflowRequest struct {
 
 // CreateWorkflow creates a new workflow
 func (c *Client) CreateWorkflow(ctx context.Context, in *CreateWorkflowRequest) (*Workflow, error) {
-	body, err := json.Marshal(in)
+	body, err := json.Marshal(struct {
+		*CreateWorkflowRequest
+		WorkflowType WorkflowType `json:"workflow_type"`
+	}{
+		CreateWorkflowRequest: in,
+		WorkflowType:          WorkflowTypeCustom,
+	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal workflow request: %w", err)
 	}
diff --git a/workflow_create_test.go b/workflow_create_test.go
index 7efb262..fdd2770 100644
--- a/workflow_create_test.go
+++ b/workflow_create_test.go
@@ -32,7 +32,6 @@ func TestCreateWorkflow(t *testing.T) {
 
 	workflow, err := client.CreateWorkflow(t.Context(), &CreateWorkflowRequest{
 		Name:          "test_workflow",
-		WorkflowType:  WorkflowTypeAdvanced,
 		Schedule:      String("weekly"),
 		SourceID:      String("f1f7b1b2-8e4b-4a2b-8f1d-3e3c7c9e5a3c"),
 		DestinationID: String("aeebecc7-9d8e-4625-bf1d-815c2f084869"),
diff --git a/workflow_node.go b/workflow_node.go
index e01036d..8dd2418 100644
--- a/workflow_node.go
+++ b/workflow_node.go
@@ -12,7 +12,7 @@ type WorkflowNodes []WorkflowNode
 // ValidateNodeOrder validates the order of nodes in a workflow.
 func (w WorkflowNodes) ValidateNodeOrder() (err error) {
 	if len(w) == 0 {
-		return nil
+		return errors.New("first node must be a partitioner")
 	}
 
 	// you have to partition.
@@ -25,8 +25,14 @@ func (w WorkflowNodes) ValidateNodeOrder() (err error) {
 
 	last := nodeTypePartition
 
+	var (
+		didEnrichTable bool
+		didEnrichNER   bool
+		didEnrichImage bool
+	)
+
 	for i, node := range w[1:] {
-		switch node.(type) {
+		switch node := node.(type) {
 		case *PartitionerAuto, *PartitionerVLM, *PartitionerHiRes, *PartitionerFast:
 			err = errors.Join(err, errors.New("only the first node may be a partitioner"))
 
@@ -49,7 +55,7 @@ func (w WorkflowNodes) ValidateNodeOrder() (err error) {
 		case *Enricher:
 			// you can enrich before you chunk...
 			if i == len(w[1:])-1 {
-				err = errors.Join(err, errors.New("enricher must not be the last node"))
+				err = errors.Join(err, fmt.Errorf("%s must not be the last node", nodeTypeEnrich))
 			}
 
 			// and after you partition or enrich.
@@ -57,6 +63,27 @@ func (w WorkflowNodes) ValidateNodeOrder() (err error) {
 				err = errors.Join(err, fmt.Errorf("%s must be after %s or %s", nodeTypeEnrich, nodeTypePartition, nodeTypeEnrich))
 			}
 
+			// you can only have one image enrichment anywhere in the workflow.
+			if node.isImage() && didEnrichImage {
+				err = errors.Join(err, errors.New("only one image enrichment is allowed"))
+			}
+
+			didEnrichImage = didEnrichImage || node.isImage() // sticky: a later table/NER enricher must not clear it
+
+			// you can only have one table enrichment anywhere in the workflow.
+			if node.isTable() && didEnrichTable {
+				err = errors.Join(err, errors.New("only one table enrichment is allowed"))
+			}
+
+			didEnrichTable = didEnrichTable || node.isTable() // sticky: a later image/NER enricher must not clear it
+
+			// you can only have one NER enrichment anywhere in the workflow.
+			if node.isNER() && didEnrichNER {
+				err = errors.Join(err, errors.New("only one NER enrichment is allowed"))
+			}
+
+			didEnrichNER = didEnrichNER || node.isNER() // sticky: a later image/table enricher must not clear it
+
 			last = nodeTypeEnrich
 
 		default:
@@ -67,11 +94,6 @@ func (w WorkflowNodes) ValidateNodeOrder() (err error) {
 	return err
 }
 
-type (
-	// Embedder represents an embedding node in a workflow.
-	Embedder struct{ WorkflowNode }
-)
-
 // MarshalJSON implements the json.Marshaler interface.
 func (w WorkflowNodes) MarshalJSON() ([]byte, error) {
 	nodes := make([]json.RawMessage, len(w))
@@ -159,5 +181,3 @@ func unmarshalNode(data []byte) (WorkflowNode, error) {
 
 	return nil, fmt.Errorf("unknown node type: %s", header.Type)
 }
-
-func unmarshalEmbedder(_ header) (WorkflowNode, error) { return &Embedder{}, nil }
diff --git a/workflow_node_test.go b/workflow_node_test.go
new file mode 100644
index 0000000..d3700d7
--- /dev/null
+++ b/workflow_node_test.go
@@ -0,0 +1,136 @@
+package unstructured
+
+import (
+	"maps"
+	"strings"
+	"testing"
+)
+
+func TestWorkflowNodeOrder(t *testing.T) { // table-driven check of WorkflowNodes.ValidateNodeOrder
+	t.Parallel()
+
+	partitioners := map[string]WorkflowNode{
+		"none":            nil, // nil means: no partitioner is appended to the sequence
+		"partition_auto":  &PartitionerAuto{},
+		"partition_vlm":   &PartitionerVLM{},
+		"partition_hires": &PartitionerHiRes{},
+		"partition_fast":  &PartitionerFast{},
+	}
+
+	chunkers := map[string]WorkflowNode{
+		"none":               nil, // nil means: no chunker is appended to the sequence
+		"chunker_character":  &ChunkerCharacter{},
+		"chunker_title":      &ChunkerTitle{},
+		"chunker_page":       &ChunkerPage{},
+		"chunker_similarity": &ChunkerSimilarity{},
+	}
+
+	enrichers := map[string]*Enricher{ // values are *Enricher, so the nil check below compares a pointer, not an interface
+		"none":                       nil,
+		"enricher_image_openai":      {Subtype: EnrichmentTypeImageOpenAI},
+		"enricher_table_openai":      {Subtype: EnrichmentTypeTableOpenAI},
+		"enricher_table2html_openai": {Subtype: EnrichmentTypeTable2HTMLOpenAI},
+		"enricher_ner_openai":        {Subtype: EnrichmentTypeNEROpenAI},
+		"enricher_image_anthropic":   {Subtype: EnrichmentTypeImageAnthropic},
+		"enricher_table_anthropic":   {Subtype: EnrichmentTypeTableAnthropic},
+		"enricher_ner_anthropic":     {Subtype: EnrichmentTypeNERAnthropic},
+		"enricher_image_bedrock":     {Subtype: EnrichmentTypeImageBedrock},
+		"enricher_table_bedrock":     {Subtype: EnrichmentTypeTableBedrock},
+	}
+
+	embedders := map[string]WorkflowNode{
+		"none":     nil,
+		"embedder": &Embedder{},
+	}
+
+	type testcase struct {
+		nodes   WorkflowNodes
+		wantErr bool
+	}
+
+	tests := make(map[string]testcase, len(partitioners)*len(chunkers)*len(embedders)*len(enrichers)+4) // +4: the hand-written cases copied in below
+
+	for partitionerName, partitioner := range partitioners {
+		for chunkerName, chunker := range chunkers {
+			for enricherName, enricher := range enrichers {
+				for embedderName, embedder := range embedders {
+					labels := []string{}
+
+					var tc testcase
+
+					tc.wantErr = partitioner == nil // a sequence with no partitioner is expected to be rejected
+
+					if partitioner != nil {
+						labels = append(labels, partitionerName)
+
+						tc.nodes = append(tc.nodes, partitioner)
+					}
+
+					if enricher != nil { // nodes are appended in the order partitioner, enricher, chunker, embedder
+						labels = append(labels, enricherName)
+
+						tc.nodes = append(tc.nodes, enricher)
+
+						tc.wantErr = tc.wantErr || chunker == nil // an enricher with no chunker after it is expected to fail
+					}
+
+					if chunker != nil {
+						labels = append(labels, chunkerName)
+
+						tc.nodes = append(tc.nodes, chunker)
+					}
+
+					if embedder != nil {
+						labels = append(labels, embedderName)
+
+						tc.nodes = append(tc.nodes, embedder)
+
+						tc.wantErr = tc.wantErr || chunker == nil // an embedder with no chunker before it is expected to fail
+					}
+
+					name := strings.Join(labels, "-")
+					if name == "" { // every stage was "none": this is the empty node list
+						name = "none"
+					}
+
+					tests[name] = tc
+				}
+			}
+		}
+	}
+
+	maps.Copy(tests, map[string]testcase{ // cases the loops above cannot generate: misordered nodes and two enrichers
+		"wrong_order": {
+			nodes:   WorkflowNodes{&PartitionerAuto{}, &ChunkerCharacter{}, &Embedder{}, &Enricher{Subtype: EnrichmentTypeImageOpenAI}},
+			wantErr: true,
+		},
+		"double_image_enricher": {
+			nodes:   WorkflowNodes{&PartitionerAuto{}, &Enricher{Subtype: EnrichmentTypeImageOpenAI}, &Enricher{Subtype: EnrichmentTypeImageAnthropic}, &ChunkerCharacter{}},
+			wantErr: true,
+		},
+		"double_table_enricher": {
+			nodes:   WorkflowNodes{&PartitionerAuto{}, &Enricher{Subtype: EnrichmentTypeTableOpenAI}, &Enricher{Subtype: EnrichmentTypeTableBedrock}, &ChunkerCharacter{}},
+			wantErr: true,
+		},
+		"double_ner_enricher": {
+			nodes:   WorkflowNodes{&PartitionerAuto{}, &Enricher{Subtype: EnrichmentTypeNEROpenAI}, &Enricher{Subtype: EnrichmentTypeNERAnthropic}, &ChunkerCharacter{}},
+			wantErr: true,
+		},
+	})
+
+	for name, test := range tests {
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+
+			got := test.nodes.ValidateNodeOrder()
+
+			switch { // only the presence or absence of an error is checked, not its message
+			case !test.wantErr && got != nil:
+				t.Errorf("got\n%v\nwant nil", got)
+
+			case test.wantErr && got == nil:
+				t.Errorf("got nil, want error")
+			}
+		})
+	}
+}

Layer Type	Complexity per Layer	Sequential Operations	Maximum Path Length
Self-Attention	O(n² · d)	O(1)	O(1)
Recurrent	O(n · d²)	O(n)	O(n)
Convolutional	O(k · n · d²)	O(1)	O(logk(n))
Self-Attention (restricted)	O(r · n · d)	O(1)	O(n/r)
Model	BLEU	Training Cost (FLOPs)
	EN-DE	EN-FR	EN-DE	EN-FR
ByteNet [18]	23.75
Deep-Att + PosUnk [39]		39.2		1.0 · 10²⁰
GNMT + RL [38]	24.6	39.92	2.3 · 10¹⁹	1.4 · 10²⁰
ConvS2S [9]	25.16	40.46	9.6 · 10¹⁸	1.5 · 10²⁰
MoE [32]	26.03	40.56	2.0 · 10¹⁹	1.2 · 10²⁰
Deep-Att + PosUnk Ensemble [39]		40.4		8.0 · 10²⁰
GNMT + RL Ensemble [38]	26.30	41.16	1.8 · 10²⁰	1.1 · 10²¹
ConvS2S Ensemble [9]	26.36	41.29	7.7 · 10¹⁹	1.2 · 10²¹
Transformer (base model)	27.3	38.1	3.3 · 10¹⁸
Transformer (big)	28.4	41.8		2.3 · 10¹⁹
N	d_model	d_ff	h	d_k	d_v	P_drop	ε_ls	train steps	PPL (dev)	BLEU (dev)	params ×10⁶
base	6	512	2048	8	64	64	0.1	0.1	100K	4.92	25.8	65
(A)				1	512	512				5.29	24.9
				4	128	128				5.00	25.5
				16	32	32				4.91	25.8
				32	16	16				5.01	25.4
(B)						16				5.16	25.1	58
						32				5.01	25.4	60
(C)	2									6.11	23.7	36
	4									5.19	25.3	50
	8									4.88	25.5	80
		256			32	32				5.75	24.5	28
		1024			128	128				4.66	26.0	168
			1024							5.12	25.4	53
			4096							4.75	26.2	90
(D)							0.0			5.77	24.6
						0.2			4.95	25.5
							0.0		4.67	25.3
							0.2		5.47	25.7
(E)	positional embedding instead of sinusoids	4.92	25.7
big	6	1024	4096	16			0.3		300K	4.33	26.4	213
Parser	Training	WSJ 23 F1
Vinyals & Kaiser et al. (2014) [37]	WSJ only, discriminative	88.3
Petrov et al. (2006) [29]	WSJ only, discriminative	90.4
Zhu et al. (2013) [40]	WSJ only, discriminative	90.4
Dyer et al. (2016) [8]	WSJ only, discriminative	91.7
Transformer (4 layers)	WSJ only, discriminative	91.3
Zhu et al. (2013) [40]	semi-supervised	91.3
Huang & Harper (2009) [14]	semi-supervised	91.3
McClosky et al. (2006) [26]	semi-supervised	92.1
Vinyals & Kaiser et al. (2014) [37]	semi-supervised	92.1
Transformer (4 layers)	semi-supervised	92.7
Luong et al. (2015) [23]	multi-task	93.0
Dyer et al. (2016) [8]	generative	93.3