@@ -27,6 +27,7 @@ package vet
2727import (
2828 "bytes"
2929 "context"
30+ "encoding/json"
3031 "fmt"
3132 "reflect"
3233 "strings"
@@ -91,8 +92,8 @@ func Evaluate(_ context.Context, p *beam.Pipeline) (*Eval, error) {
9192 e := newEval ()
9293
9394 e .diag ("/**\n " )
94- e .extractFromMultiEdges (edges )
95- return e , nil
95+ err = e .extractFromMultiEdges (edges )
96+ return e , err
9697}
9798
9899func newEval () * Eval {
@@ -133,22 +134,27 @@ type Eval struct {
133134
134135// extractFromMultiEdges audits the given pipeline edges so we can determine if
135136// this pipeline will run without reflection.
136- func (e * Eval ) extractFromMultiEdges (edges []* graph.MultiEdge ) {
137+ func (e * Eval ) extractFromMultiEdges (edges []* graph.MultiEdge ) error {
137138 e .diag ("PTransform Audit:\n " )
138139 for _ , edge := range edges {
139140 switch edge .Op {
140141 case graph .ParDo :
141142 // Gets the ParDo's identifier
142143 e .diagf ("pardo %s" , edge .Name ())
143- e .extractGraphFn ((* graph .Fn )(edge .DoFn ))
144+ if err := e .extractGraphFn ((* graph .Fn )(edge .DoFn )); err != nil {
145+ return err
146+ }
144147 case graph .Combine :
145148 e .diagf ("combine %s" , edge .Name ())
146- e .extractGraphFn ((* graph .Fn )(edge .CombineFn ))
149+ if err := e .extractGraphFn ((* graph .Fn )(edge .CombineFn )); err != nil {
150+ return err
151+ }
147152 default :
148153 continue
149154 }
150155 e .diag ("\n " )
151156 }
157+ return nil
152158}
153159
154160// Performant returns whether this pipeline needs additional registrations
@@ -485,6 +491,73 @@ func (e *Eval) Bytes() []byte {
485491 return e .w .Bytes ()
486492}
487493
494+ // checkStructFieldsUTF8 recursively validates that all string fields in the
495+ // given value are UTF-8 compliant.
496+ // It handles structs, slices, arrays, maps, and individual strings while
497+ // avoiding infinite recursion on circular references.
498+ // The function skips validation for types that implement both json.Marshaler
499+ // and json.Unmarshaler interfaces.
500+ //
501+ // Parameters:
502+ // - v: reflect.Value to check
503+ // - seen: map tracking visited values to prevent infinite recursion
504+ //
505+ // Returns:
506+ // - error if any string field contains invalid UTF-8 encoding, nil otherwise
507+ func (e * Eval ) checkStructFieldsUTF8 (v reflect.Value , seen map [reflect.Value ]bool ) error {
508+ if ! v .IsValid () || seen [v ] {
509+ return nil
510+ }
511+
512+ // Track visited values to prevent infinite recursion on circular references.
513+ seen [v ] = true
514+
515+ t := v .Type ()
516+
517+ // Skip if type implements JSON marshaling.
518+ _ , hasMarshaler := reflect .New (t ).Interface ().(json.Marshaler )
519+ _ , hasUnmarshaler := reflect .New (t ).Interface ().(json.Unmarshaler )
520+ if hasMarshaler && hasUnmarshaler {
521+ return nil
522+ }
523+
524+ switch t .Kind () {
525+ case reflect .Struct :
526+ for i := 0 ; i < v .NumField (); i ++ {
527+ field := v .Field (i )
528+ if ! field .CanInterface () {
529+ // Skip unexported fields.
530+ continue
531+ }
532+ if err := e .checkStructFieldsUTF8 (field , seen ); err != nil {
533+ return err
534+ }
535+ }
536+ case reflect .Slice , reflect .Array :
537+ for i := 0 ; i < v .Len (); i ++ {
538+ if err := e .checkStructFieldsUTF8 (v .Index (i ), seen ); err != nil {
539+ return err
540+ }
541+ }
542+ case reflect .Map :
543+ iter := v .MapRange ()
544+ for iter .Next () {
545+ if err := e .checkStructFieldsUTF8 (iter .Key (), seen ); err != nil {
546+ return err
547+ }
548+ if err := e .checkStructFieldsUTF8 (iter .Value (), seen ); err != nil {
549+ return err
550+ }
551+ }
552+ case reflect .String :
553+ str := v .String ()
554+ if ! utf8 .ValidString (str ) {
555+ return fmt .Errorf ("non-UTF8 compliant string found: %q" , str )
556+ }
557+ }
558+ return nil
559+ }
560+
488561// We need to take graph.Fns (which can be created from any from graph.NewFn)
489562// and convert them to all needed function caller signatures,
490563// and emitters.
@@ -500,17 +573,29 @@ func (e *Eval) Bytes() []byte {
500573
501574// extractGraphFn does the analysis of the function and determines what things need generating.
502575// A single line is used, unless it's a struct, at which point one line per implemented method
503- // is used.
504- func (e * Eval ) extractGraphFn (fn * graph.Fn ) {
576+ // is used. For structs, it also validates UTF-8 compliance of all exported string fields.
577+ func (e * Eval ) extractGraphFn (fn * graph.Fn ) error {
505578 if fn .DynFn != nil {
506579 // TODO(https://github.com/apache/beam/issues/19401) handle dynamics if necessary (probably not since it's got general function handling)
507580 e .diag (" dynamic function" )
508- return
581+ return nil
509582 }
510583 if fn .Recv != nil {
511584 e .diagf (" struct[[%T]]" , fn .Recv )
512585
513- rt := reflectx .SkipPtr (reflect .TypeOf (fn .Recv )) // We need the value not the pointer that's used.
586+ // We need the value not the pointer that's used.
587+ rt := reflectx .SkipPtr (reflect .TypeOf (fn .Recv ))
588+ rv := reflect .ValueOf (fn .Recv )
589+ if rv .Kind () == reflect .Ptr {
590+ rv = rv .Elem ()
591+ }
592+
593+ // Add UTF-8 compliance check for struct fields.
594+ seen := make (map [reflect.Value ]bool )
595+ if err := e .checkStructFieldsUTF8 (rv , seen ); err != nil {
596+ return err
597+ }
598+
514599 if tk , ok := runtime .TypeKey (rt ); ok {
515600 if t , found := runtime .LookupType (tk ); ! found {
516601 e .needType (tk , rt )
@@ -532,6 +617,8 @@ func (e *Eval) extractGraphFn(fn *graph.Fn) {
532617 }
533618 e .extractFuncxFn (fn .Fn )
534619 }
620+
621+ return nil
535622}
536623
537624type mthd struct {
0 commit comments