Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,14 @@ linters:
check-type-assertions: false
exclude-functions:
- io/ioutil.ReadFile
- io.Copy(*bytes.Buffer)
- io.Copy(os.Stdout)
- io.Copy
- (io.Closer).Close
- (net/http.ResponseWriter).Write
- io.Writer.Write
gosec:
excludes:
- G101
- G104

# Issues configuration
issues:
Expand Down
5 changes: 4 additions & 1 deletion bearer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ type bearer struct {
rt http.RoundTripper
}

// HeaderKey is the name of the HTTP request header, "Unstructured-API-Key",
// in which Unstructured expects to find the API key.
const HeaderKey = "Unstructured-API-Key"

// RoundTrip implements the http.RoundTripper interface.
func (b *bearer) RoundTrip(req *http.Request) (*http.Response, error) {
req.Header.Set("Unstructured-API-Key", b.key)
req.Header.Set(HeaderKey, b.key)

// This is implementing the http.RoundTripper interface, errors should be passed through as-is
return b.rt.RoundTrip(req) //nolint:wrapcheck
Expand Down
10 changes: 10 additions & 0 deletions block_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package unstructured

// BlockType names a kind of block. It is a string type whose known values
// are the BlockType* constants declared below.
type BlockType string

// Known BlockType values.
const (
	// BlockTypeImage is the "Image" block type.
	BlockTypeImage = BlockType("Image")
	// BlockTypeTable is the "Table" block type.
	BlockTypeTable = BlockType("Table")
)
54 changes: 54 additions & 0 deletions chunker_character.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package unstructured

import (
"encoding/json"
"fmt"
)

// ChunkerCharacter is a workflow node that chunks text by character.
//
// ID and Name are excluded from the struct's own JSON (`json:"-"`); MarshalJSON
// writes them into the node header instead. The remaining fields are encoded
// as the node's settings.
type ChunkerCharacter struct {
// Node identity; carried in the header rather than in the settings payload.
ID string `json:"-"`
Name string `json:"-"`
// Settings fields. Those tagged omitempty are left out of the JSON when zero-valued.
APIURL string `json:"unstructured_api_url,omitempty"`
APIKey string `json:"unstructured_api_key,omitempty"`
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
MaxCharacters int `json:"max_characters,omitempty"`
Overlap int `json:"overlap,omitempty"`
// OverlapAll has no omitempty tag, so it is always emitted, including when false.
OverlapAll bool `json:"overlap_all"`
ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
}

// ChunkingStrategy is a strategy for contextual chunking. The chunker nodes
// serialize it under the "contextual_chunking_strategy" JSON key.
type ChunkingStrategy string

// ChunkingStrategyV1 is the "v1" contextual chunking strategy.
// It is declared as an untyped string constant, so it can be assigned to a
// ChunkingStrategy field as well as to a plain string.
const ChunkingStrategyV1 = "v1"

// Compile-time check that *ChunkerCharacter satisfies WorkflowNode.
var _ WorkflowNode = (*ChunkerCharacter)(nil)

// isNode marks ChunkerCharacter as a WorkflowNode; it has no behavior of its own.
func (ChunkerCharacter) isNode() {}

// MarshalJSON implements the json.Marshaler interface.
//
// The node's own fields are encoded as the settings payload, which is then
// wrapped in a header carrying the ID, name, node type, and subtype.
func (c ChunkerCharacter) MarshalJSON() ([]byte, error) {
	// settings has the same fields as ChunkerCharacter but none of its methods,
	// so encoding it does not call back into this MarshalJSON.
	type settings ChunkerCharacter

	payload, err := json.Marshal(settings(c))
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker character: %w", err)
	}

	node := header{
		ID:       c.ID,
		Name:     c.Name,
		Type:     nodeTypeChunk,
		Subtype:  string(ChunkerSubtypeCharacter),
		Settings: json.RawMessage(payload),
	}

	encoded, err := json.Marshal(node)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker character header: %w", err)
	}

	return encoded, nil
}
48 changes: 48 additions & 0 deletions chunker_page.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package unstructured

import (
"encoding/json"
"fmt"
)

// ChunkerPage is a workflow node that chunks text by page.
//
// ID and Name are excluded from the struct's own JSON (`json:"-"`); MarshalJSON
// writes them into the node header instead. The remaining fields are encoded
// as the node's settings.
type ChunkerPage struct {
// Node identity; carried in the header rather than in the settings payload.
ID string `json:"-"`
Name string `json:"-"`
// Settings fields. Those tagged omitempty are left out of the JSON when zero-valued.
APIURL string `json:"unstructured_api_url,omitempty"`
APIKey string `json:"unstructured_api_key,omitempty"`
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
MaxCharacters int `json:"max_characters,omitempty"`
Overlap int `json:"overlap,omitempty"`
// OverlapAll has no omitempty tag, so it is always emitted, including when false.
OverlapAll bool `json:"overlap_all"`
// Strategy is serialized under the "contextual_chunking_strategy" key.
Strategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
}

// Compile-time check that *ChunkerPage satisfies WorkflowNode.
var _ WorkflowNode = (*ChunkerPage)(nil)

// isNode marks ChunkerPage as a WorkflowNode; it has no behavior of its own.
func (ChunkerPage) isNode() {}

// MarshalJSON implements the json.Marshaler interface.
//
// The node's own fields are encoded as the settings payload, which is then
// wrapped in a header carrying the ID, name, node type, and subtype.
func (c ChunkerPage) MarshalJSON() ([]byte, error) {
	// settings has the same fields as ChunkerPage but none of its methods,
	// so encoding it does not call back into this MarshalJSON.
	type settings ChunkerPage

	payload, err := json.Marshal(settings(c))
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker page: %w", err)
	}

	node := header{
		ID:       c.ID,
		Name:     c.Name,
		Type:     nodeTypeChunk,
		Subtype:  string(ChunkerSubtypePage),
		Settings: json.RawMessage(payload),
	}

	encoded, err := json.Marshal(node)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker page header: %w", err)
	}

	return encoded, nil
}
48 changes: 48 additions & 0 deletions chunker_similarity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package unstructured

import (
"encoding/json"
"fmt"
)

// ChunkerSimilarity is a workflow node that chunks text by similarity.
//
// ID and Name are excluded from the struct's own JSON (`json:"-"`); MarshalJSON
// writes them into the node header instead. The remaining fields are encoded
// as the node's settings.
type ChunkerSimilarity struct {
// Node identity; carried in the header rather than in the settings payload.
ID string `json:"-"`
Name string `json:"-"`
// Settings fields. Those tagged omitempty are left out of the JSON when zero-valued.
APIURL string `json:"unstructured_api_url,omitempty"`
APIKey string `json:"unstructured_api_key,omitempty"`
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
MaxCharacters int `json:"max_characters,omitempty"`
Overlap int `json:"overlap,omitempty"`
// OverlapAll has no omitempty tag, so it is always emitted, including when false.
OverlapAll bool `json:"overlap_all"`
// Strategy is serialized under the "contextual_chunking_strategy" key.
Strategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
}

// Compile-time check that *ChunkerSimilarity satisfies WorkflowNode.
var _ WorkflowNode = (*ChunkerSimilarity)(nil)

// isNode marks ChunkerSimilarity as a WorkflowNode; it has no behavior of its own.
func (ChunkerSimilarity) isNode() {}

// MarshalJSON implements the json.Marshaler interface.
//
// The node's own fields are encoded as the settings payload, which is then
// wrapped in a header carrying the ID, name, node type, and subtype.
func (c ChunkerSimilarity) MarshalJSON() ([]byte, error) {
	// settings has the same fields as ChunkerSimilarity but none of its methods,
	// so encoding it does not call back into this MarshalJSON.
	type settings ChunkerSimilarity

	payload, err := json.Marshal(settings(c))
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker similarity: %w", err)
	}

	node := header{
		ID:       c.ID,
		Name:     c.Name,
		Type:     nodeTypeChunk,
		Subtype:  string(ChunkerSubtypeSimilarity),
		Settings: json.RawMessage(payload),
	}

	encoded, err := json.Marshal(node)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker similarity header: %w", err)
	}

	return encoded, nil
}
49 changes: 49 additions & 0 deletions chunker_title.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package unstructured

import (
"encoding/json"
"fmt"
)

// ChunkerTitle is a workflow node that chunks text by title.
//
// ID and Name are excluded from the struct's own JSON (`json:"-"`); MarshalJSON
// writes them into the node header instead. The remaining fields are encoded
// as the node's settings.
type ChunkerTitle struct {
// Node identity; carried in the header rather than in the settings payload.
ID string `json:"-"`
Name string `json:"-"`
// Settings fields. Those tagged omitempty are left out of the JSON when zero-valued.
APIURL string `json:"unstructured_api_url,omitempty"`
APIKey string `json:"unstructured_api_key,omitempty"`
// CombineTextUnderN is serialized under the "combine_text_under_n_chars" key.
CombineTextUnderN int `json:"combine_text_under_n_chars,omitempty"`
IncludeOrigElements bool `json:"include_orig_elements,omitempty"`
NewAfterNChars int `json:"new_after_n_chars,omitempty"`
MaxCharacters int `json:"max_characters,omitempty"`
Overlap int `json:"overlap,omitempty"`
// OverlapAll has no omitempty tag, so it is always emitted, including when false.
OverlapAll bool `json:"overlap_all"`
ContextualChunkingStrategy ChunkingStrategy `json:"contextual_chunking_strategy,omitempty"`
}

// Compile-time check that *ChunkerTitle satisfies WorkflowNode.
var _ WorkflowNode = (*ChunkerTitle)(nil)

// isNode marks ChunkerTitle as a WorkflowNode; it has no behavior of its own.
func (ChunkerTitle) isNode() {}

// MarshalJSON implements the json.Marshaler interface.
//
// The node's own fields are encoded as the settings payload, which is then
// wrapped in a header carrying the ID, name, node type, and subtype.
func (c ChunkerTitle) MarshalJSON() ([]byte, error) {
	// settings has the same fields as ChunkerTitle but none of its methods,
	// so encoding it does not call back into this MarshalJSON.
	type settings ChunkerTitle

	payload, err := json.Marshal(settings(c))
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker title: %w", err)
	}

	node := header{
		ID:       c.ID,
		Name:     c.Name,
		Type:     nodeTypeChunk,
		Subtype:  string(ChunkerSubtypeTitle),
		Settings: json.RawMessage(payload),
	}

	encoded, err := json.Marshal(node)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal chunker title header: %w", err)
	}

	return encoded, nil
}
56 changes: 56 additions & 0 deletions chunker_type.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package unstructured

import (
"encoding/json"
"fmt"
)

// ChunkerSubtype identifies which kind of chunker a chunk node is. Its values
// are the subtype strings written to and read from a node header.
type ChunkerSubtype string

// Known ChunkerSubtype values.
const (
	// ChunkerSubtypeCharacter is the "chunk_by_character" subtype.
	ChunkerSubtypeCharacter = ChunkerSubtype("chunk_by_character")
	// ChunkerSubtypeTitle is the "chunk_by_title" subtype.
	ChunkerSubtypeTitle = ChunkerSubtype("chunk_by_title")
	// ChunkerSubtypePage is the "chunk_by_page" subtype.
	ChunkerSubtypePage = ChunkerSubtype("chunk_by_page")
	// ChunkerSubtypeSimilarity is the "chunk_by_similarity" subtype.
	ChunkerSubtypeSimilarity = ChunkerSubtype("chunk_by_similarity")
)

// unmarshalChunker builds the chunker node matching the header's subtype and
// populates it from the header's settings payload.
//
// The node's ID and Name are copied from the header; the remaining fields are
// decoded from header.Settings. It returns an error when the subtype is not one
// of the ChunkerSubtype constants or when the settings fail to decode.
func unmarshalChunker(header header) (WorkflowNode, error) {
	id, name := header.ID, header.Name

	var node WorkflowNode

	switch ChunkerSubtype(header.Subtype) {
	case ChunkerSubtypeCharacter:
		node = &ChunkerCharacter{ID: id, Name: name}
	case ChunkerSubtypeTitle:
		node = &ChunkerTitle{ID: id, Name: name}
	case ChunkerSubtypePage:
		node = &ChunkerPage{ID: id, Name: name}
	case ChunkerSubtypeSimilarity:
		node = &ChunkerSimilarity{ID: id, Name: name}
	default:
		return nil, fmt.Errorf("unknown Chunker strategy: %s", header.Subtype)
	}

	// node holds a pointer to the concrete chunker, so decoding fills it in place.
	err := json.Unmarshal(header.Settings, node)
	if err != nil {
		return nil, fmt.Errorf("failed to unmarshal Chunker node: %w", err)
	}

	return node, nil
}
30 changes: 25 additions & 5 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package unstructured
import (
"cmp"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
Expand Down Expand Up @@ -55,6 +57,15 @@ func WithKey(key string) Option {
}
}

// WithClient returns an Option that sets the HTTP client to use for requests.
// If no client is provided, the client will default to [http.DefaultClient].
func WithClient(hc *http.Client) Option {
	setClient := func(c *Client) error {
		// Stored exactly as given, nil included; this option applies no default itself.
		c.hc = hc

		return nil
	}

	return setClient
}

// New creates a new Client instance with the provided options.
// If the `UNSTRUCTURED_API_KEY` environment variable is set, it will be used as the API key for authentication.
// If the `UNSTRUCTURED_API_URL` environment variable is set to a valid URL, it will be used as the base URL for the Unstructured.io API.
Expand Down Expand Up @@ -104,17 +115,26 @@ func (c *Client) do(req *http.Request, out any) error {
defer func() { _ = resp.Body.Close() }()

if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
body, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read response body: %w", err)
}

// Handle 422 validation errors specifically
if resp.StatusCode == http.StatusUnprocessableEntity {
var validationErr HTTPValidationError
if err := json.NewDecoder(resp.Body).Decode(&validationErr); err != nil {
return fmt.Errorf("failed to decode validation error response: %w", err)
if err := json.Unmarshal(body, &validationErr); err == nil {
return &APIError{
Code: resp.StatusCode,
Err: &validationErr,
}
}

return &validationErr
}

return fmt.Errorf("unsuccessful response: %s", resp.Status)
return &APIError{
Code: resp.StatusCode,
Err: errors.New(string(body)),
}
}

if out != nil {
Expand Down
Loading