Skip to content

Commit 0796e6d

Browse files
committed
Add unstructured package components for document processing
- Introduced new types and constants for block types, chunkers, partitioners, and encoders. - Implemented various chunker types (Character, Title, Page, Similarity) and partitioner strategies (Auto, Fast, HiRes, VLM). - Added support for encoding types and excludable elements in document processing. - Enhanced the client with new methods for running workflows and downloading job outputs. - Updated the Go module to version 1.25.0 and added comprehensive test cases for the new functionality.
1 parent dd7d363 commit 0796e6d

29 files changed

+1437
-27
lines changed

block_types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package unstructured
2+
3+
// BlockType is a type that represents a block type.
4+
type BlockType string
5+
6+
// BlockType constants.
7+
const (
8+
BlockTypeImage BlockType = "Image"
9+
BlockTypeTable BlockType = "Table"
10+
)

chunker_character.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package unstructured
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
)
7+
8+
// ChunkerCharacter is a node that chunks text by character.
9+
type ChunkerCharacter struct {
10+
ID string `json:"-"`
11+
Name string `json:"-"`
12+
APIURL string `json:"unstructured_api_url,omitempty"`
13+
APIKey string `json:"unstructured_api_key,omitempty"`
14+
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
15+
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
16+
MaxCharacters int `json:"max_characters,omitempty"`
17+
Overlap int `json:"overlap,omitempty"`
18+
OverlapAll bool `json:"overlap_all"`
19+
ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
20+
}
21+
22+
// ChunkingStrategy names a contextual chunking strategy. It is the type of the
// `contextual_chunking_strategy` setting on the chunker nodes.
type ChunkingStrategy string

// ChunkingStrategyV1 is the "v1" contextual chunking strategy.
//
// The constant is explicitly typed so that it carries the ChunkingStrategy type
// (matching the other typed constant groups in this package) instead of being
// an untyped string constant.
const ChunkingStrategyV1 ChunkingStrategy = "v1"
27+
28+
var _ WorkflowNode = new(ChunkerCharacter)
29+
30+
// isNode implements the WorkflowNode interface.
31+
func (c ChunkerCharacter) isNode() {}
32+
33+
// MarshalJSON implements the json.Marshaler interface.
34+
func (c ChunkerCharacter) MarshalJSON() ([]byte, error) {
35+
type alias ChunkerCharacter
36+
37+
data, err := json.Marshal(alias(c))
38+
if err != nil {
39+
return nil, fmt.Errorf("failed to marshal chunker character: %w", err)
40+
}
41+
42+
headerData, err := json.Marshal(header{
43+
ID: c.ID,
44+
Name: c.Name,
45+
Type: nodeTypeChunk,
46+
Subtype: string(ChunkerSubtypeCharacter),
47+
Settings: json.RawMessage(data),
48+
})
49+
if err != nil {
50+
return nil, fmt.Errorf("failed to marshal chunker character header: %w", err)
51+
}
52+
53+
return headerData, nil
54+
}

chunker_page.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package unstructured
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
)
7+
8+
// ChunkerPage is a node that chunks text by page.
9+
type ChunkerPage struct {
10+
ID string `json:"-"`
11+
Name string `json:"-"`
12+
APIURL string `json:"unstructured_api_url,omitempty"`
13+
APIKey string `json:"unstructured_api_key,omitempty"`
14+
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
15+
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
16+
MaxCharacters int `json:"max_characters,omitempty"`
17+
Overlap int `json:"overlap,omitempty"`
18+
OverlapAll bool `json:"overlap_all"`
19+
Strategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
20+
}
21+
22+
var _ WorkflowNode = new(ChunkerPage)
23+
24+
// isNode implements the WorkflowNode interface.
25+
func (c ChunkerPage) isNode() {}
26+
27+
// MarshalJSON implements the json.Marshaler interface.
28+
func (c ChunkerPage) MarshalJSON() ([]byte, error) {
29+
type alias ChunkerPage
30+
31+
data, err := json.Marshal(alias(c))
32+
if err != nil {
33+
return nil, fmt.Errorf("failed to marshal chunker page: %w", err)
34+
}
35+
36+
headerData, err := json.Marshal(header{
37+
ID: c.ID,
38+
Name: c.Name,
39+
Type: nodeTypeChunk,
40+
Subtype: string(ChunkerSubtypePage),
41+
Settings: json.RawMessage(data),
42+
})
43+
if err != nil {
44+
return nil, fmt.Errorf("failed to marshal chunker page header: %w", err)
45+
}
46+
47+
return headerData, nil
48+
}

chunker_similarity.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package unstructured
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
)
7+
8+
// ChunkerSimilarity is a node that chunks text by similarity.
9+
type ChunkerSimilarity struct {
10+
ID string `json:"-"`
11+
Name string `json:"-"`
12+
APIURL string `json:"unstructured_api_url,omitempty"`
13+
APIKey string `json:"unstructured_api_key,omitempty"`
14+
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
15+
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
16+
MaxCharacters int `json:"max_characters,omitempty"`
17+
Overlap int `json:"overlap,omitempty"`
18+
OverlapAll bool `json:"overlap_all"`
19+
Strategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
20+
}
21+
22+
var _ WorkflowNode = new(ChunkerSimilarity)
23+
24+
// isNode implements the WorkflowNode interface.
25+
func (c ChunkerSimilarity) isNode() {}
26+
27+
// MarshalJSON implements the json.Marshaler interface.
28+
func (c ChunkerSimilarity) MarshalJSON() ([]byte, error) {
29+
type alias ChunkerSimilarity
30+
31+
data, err := json.Marshal(alias(c))
32+
if err != nil {
33+
return nil, fmt.Errorf("failed to marshal chunker similarity: %w", err)
34+
}
35+
36+
headerData, err := json.Marshal(header{
37+
ID: c.ID,
38+
Name: c.Name,
39+
Type: nodeTypeChunk,
40+
Subtype: string(ChunkerSubtypeSimilarity),
41+
Settings: json.RawMessage(data),
42+
})
43+
if err != nil {
44+
return nil, fmt.Errorf("failed to marshal chunker similarity header: %w", err)
45+
}
46+
47+
return headerData, nil
48+
}

chunker_title.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package unstructured
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
)
7+
8+
// ChunkerTitle is a node that chunks text by title.
9+
type ChunkerTitle struct {
10+
ID string `json:"-"`
11+
Name string `json:"-"`
12+
APIURL string `json:"unstructured_api_url,omitempty"`
13+
APIKey string `json:"unstructured_api_key,omitempty"`
14+
CombineTextUnderN int `json:"combine_text_under_n_chars,omitempty"`
15+
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
16+
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
17+
MaxCharacters int `json:"max_characters,omitempty"`
18+
Overlap int `json:"overlap,omitempty"`
19+
OverlapAll bool `json:"overlap_all"`
20+
ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
21+
}
22+
23+
var _ WorkflowNode = new(ChunkerTitle)
24+
25+
// isNode implements the WorkflowNode interface.
26+
func (c ChunkerTitle) isNode() {}
27+
28+
// MarshalJSON implements the json.Marshaler interface.
29+
func (c ChunkerTitle) MarshalJSON() ([]byte, error) {
30+
type alias ChunkerTitle
31+
32+
data, err := json.Marshal(alias(c))
33+
if err != nil {
34+
return nil, fmt.Errorf("failed to marshal chunker title: %w", err)
35+
}
36+
37+
headerData, err := json.Marshal(header{
38+
ID: c.ID,
39+
Name: c.Name,
40+
Type: nodeTypeChunk,
41+
Subtype: string(ChunkerSubtypeTitle),
42+
Settings: json.RawMessage(data),
43+
})
44+
if err != nil {
45+
return nil, fmt.Errorf("failed to marshal chunker title header: %w", err)
46+
}
47+
48+
return headerData, nil
49+
}

chunker_type.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package unstructured
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
)
7+
8+
// ChunkerSubtype is a type that represents a chunker subtype.
9+
type ChunkerSubtype string
10+
11+
// ChunkerSubtype constants.
12+
const (
13+
ChunkerSubtypeCharacter ChunkerSubtype = "chunk_by_character"
14+
ChunkerSubtypeTitle ChunkerSubtype = "chunk_by_title"
15+
ChunkerSubtypePage ChunkerSubtype = "chunk_by_page"
16+
ChunkerSubtypeSimilarity ChunkerSubtype = "chunk_by_similarity"
17+
)
18+
19+
func unmarshalChunker(header header) (WorkflowNode, error) {
20+
var chunker WorkflowNode
21+
22+
switch ChunkerSubtype(header.Subtype) {
23+
case ChunkerSubtypeCharacter:
24+
chunker = &ChunkerCharacter{
25+
ID: header.ID,
26+
Name: header.Name,
27+
}
28+
29+
case ChunkerSubtypeTitle:
30+
chunker = &ChunkerTitle{
31+
ID: header.ID,
32+
Name: header.Name,
33+
}
34+
35+
case ChunkerSubtypePage:
36+
chunker = &ChunkerPage{
37+
ID: header.ID,
38+
Name: header.Name,
39+
}
40+
41+
case ChunkerSubtypeSimilarity:
42+
chunker = &ChunkerSimilarity{
43+
ID: header.ID,
44+
Name: header.Name,
45+
}
46+
47+
default:
48+
return nil, fmt.Errorf("unknown Chunker strategy: %s", header.Subtype)
49+
}
50+
51+
if err := json.Unmarshal(header.Settings, chunker); err != nil {
52+
return nil, fmt.Errorf("failed to unmarshal Chunker node: %w", err)
53+
}
54+
55+
return chunker, nil
56+
}

client.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"cmp"
55
"encoding/json"
66
"fmt"
7+
"io"
78
"net/http"
89
"net/url"
910
"os"
@@ -55,6 +56,15 @@ func WithKey(key string) Option {
5556
}
5657
}
5758

59+
// WithClient returns an Option that sets the HTTP client to use for requests.
60+
// If no client is provided, the client will default to [http.DefaultClient].
61+
func WithClient(hc *http.Client) Option {
62+
return func(c *Client) error {
63+
c.hc = hc
64+
return nil
65+
}
66+
}
67+
5868
// New creates a new Client instance with the provided options.
5969
// If the `UNSTRUCTURED_API_KEY` environment variable is set, it will be used as the API key for authentication.
6070
// If the `UNSTRUCTURED_API_URL` environment variable is set to a valid URL, it will be used as the base URL for the Unstructured.io API.
@@ -104,17 +114,20 @@ func (c *Client) do(req *http.Request, out any) error {
104114
defer func() { _ = resp.Body.Close() }()
105115

106116
if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
117+
body, err := io.ReadAll(resp.Body)
118+
if err != nil {
119+
return fmt.Errorf("failed to read response body: %w", err)
120+
}
121+
107122
// Handle 422 validation errors specifically
108123
if resp.StatusCode == http.StatusUnprocessableEntity {
109124
var validationErr HTTPValidationError
110-
if err := json.NewDecoder(resp.Body).Decode(&validationErr); err != nil {
111-
return fmt.Errorf("failed to decode validation error response: %w", err)
125+
if err := json.Unmarshal(body, &validationErr); err == nil {
126+
return &validationErr
112127
}
113-
114-
return &validationErr
115128
}
116129

117-
return fmt.Errorf("unsuccessful response: %s", resp.Status)
130+
return fmt.Errorf("[%s]: %s", resp.Status, string(body))
118131
}
119132

120133
if out != nil {

encoding.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package unstructured
2+
3+
import "strings"
4+
5+
// Encoding is a type that represents an encoding.
type Encoding string

// Encoding constants.
const (
	EncodingUTF8          Encoding = "utf_8"
	EncodingISO88591      Encoding = "iso_8859_1"
	EncodingISO88596      Encoding = "iso_8859_6"
	EncodingISO88598      Encoding = "iso_8859_8"
	EncodingASCII         Encoding = "ascii"
	EncodingBig5          Encoding = "big5"
	EncodingUTF16         Encoding = "utf_16"
	EncodingUTF16Be       Encoding = "utf_16_be"
	EncodingUTF16Le       Encoding = "utf_16_le"
	EncodingUTF32         Encoding = "utf_32"
	EncodingUTF32Be       Encoding = "utf_32_be"
	EncodingUTF32Le       Encoding = "utf_32_le"
	EncodingEUCJIS2004    Encoding = "euc_jis_2004"
	EncodingEUCJISX0213   Encoding = "euc_jisx0213"
	EncodingEUCJP         Encoding = "euc_jp"
	EncodingEUCKR         Encoding = "euc_kr"
	EncodingGb18030       Encoding = "gb18030"
	EncodingSHIFTJIS      Encoding = "shift_jis"
	EncodingSHIFTJIS2004  Encoding = "shift_jis_2004"
	EncodingSHIFTJISX0213 Encoding = "shift_jisx0213"
)

// String implements the fmt.Stringer interface, canonicalizing the encoding name.
//
// The name is trimmed of surrounding whitespace, lower-cased, and every
// underscore is replaced with a hyphen. The ISO-8859-6 and ISO-8859-8 variants
// that carry a trailing "-i" or "-e" annotation have that suffix removed, so
// "ISO_8859_6_I" becomes "iso-8859-6".
func (e Encoding) String() string {
	s := strings.TrimSpace(string(e))
	s = strings.ToLower(s)
	s = strings.ReplaceAll(s, "_", "-")

	// Underscores have already been rewritten to hyphens above, so the
	// annotated names must be matched in their hyphenated spelling.
	switch s {
	case "iso-8859-6-i", "iso-8859-8-i",
		"iso-8859-6-e", "iso-8859-8-e":
		s = s[:len(s)-2] // drop the trailing "-i" / "-e"
	}

	return s
}

0 commit comments

Comments
 (0)