diff options
| author | Jeff Carr <[email protected]> | 2025-06-29 02:49:00 -0500 | 
|---|---|---|
| committer | Jeff Carr <[email protected]> | 2025-06-29 02:49:00 -0500 | 
| commit | 4d61cadd81f94357cafc65bcbae71c22111de4b1 (patch) | |
| tree | e718001039305d24eb593c05da4a4b71d5904564 | |
| parent | 2ee8d4947bd6107e95168c01f278178e314081e9 (diff) | |
fixes UTF8 from really old git commits in ubootv0.0.5
| -rw-r--r-- | SanitizeUTF8.go | 145 | 
1 files changed, 145 insertions, 0 deletions
diff --git a/SanitizeUTF8.go b/SanitizeUTF8.go new file mode 100644 index 0000000..9ad4171 --- /dev/null +++ b/SanitizeUTF8.go @@ -0,0 +1,145 @@ +// Copyright 2017-2025 WIT.COM Inc. All rights reserved. +// Use of this source code is governed by the GPL 3.0 + +package bugpb + +import ( +	"bytes" +	"fmt" +	"io" +	"reflect" +	"unicode/utf8" + +	"golang.org/x/text/encoding/charmap" +	"google.golang.org/protobuf/proto" +) + +// ValidateProtoUTF8 checks all string fields in a proto.Message recursively. +func ValidateProtoUTF8(msg proto.Message) error { +	return validateValue(reflect.ValueOf(msg), "") +} + +func validateValue(val reflect.Value, path string) error { +	if !val.IsValid() { +		return nil +	} + +	if val.Kind() == reflect.Ptr { +		if val.IsNil() { +			return nil +		} +		return validateValue(val.Elem(), path) +	} + +	switch val.Kind() { +	case reflect.Struct: +		for i := 0; i < val.NumField(); i++ { +			field := val.Field(i) +			fieldType := val.Type().Field(i) +			fieldPath := fmt.Sprintf("%s.%s", path, fieldType.Name) +			if err := validateValue(field, fieldPath); err != nil { +				return err +			} +		} + +	case reflect.String: +		s := val.String() +		if !utf8.ValidString(s) { +			return fmt.Errorf("invalid UTF-8 string at %s: %q", path, s) +		} + +	case reflect.Slice: +		if val.Type().Elem().Kind() == reflect.Uint8 { +			return nil // skip []byte +		} +		for i := 0; i < val.Len(); i++ { +			if err := validateValue(val.Index(i), fmt.Sprintf("%s[%d]", path, i)); err != nil { +				return err +			} +		} + +	case reflect.Map: +		for _, key := range val.MapKeys() { +			valItem := val.MapIndex(key) +			if err := validateValue(valItem, fmt.Sprintf("%s[%v]", path, key)); err != nil { +				return err +			} +		} +	} + +	return nil +} + +// SanitizeProtoUTF8 fixes all invalid UTF-8 strings in a proto.Message recursively. +func SanitizeProtoUTF8(msg proto.Message) error { +	return sanitizeValue(reflect.ValueOf(msg), "") +} + +func sanitizeValue(val reflect.Value, path string) error { +	if !val.IsValid() { +		return nil +	} + +	if val.Kind() == reflect.Ptr { +		if val.IsNil() { +			return nil +		} +		return sanitizeValue(val.Elem(), path) +	} + +	switch val.Kind() { +	case reflect.Struct: +		for i := 0; i < val.NumField(); i++ { +			field := val.Field(i) +			fieldType := val.Type().Field(i) +			if !field.CanSet() { +				continue +			} +			if err := sanitizeValue(field, fmt.Sprintf("%s.%s", path, fieldType.Name)); err != nil { +				return err +			} +		} + +	case reflect.String: +		s := val.String() +		if !utf8.ValidString(s) { +			utf8Str, err := latin1ToUTF8(s) +			if err != nil { +				return fmt.Errorf("failed to convert %s to UTF-8: %v", path, err) +			} +			val.SetString(utf8Str) +		} + +	case reflect.Slice: +		if val.Type().Elem().Kind() == reflect.Uint8 { +			return nil // skip []byte +		} +		for i := 0; i < val.Len(); i++ { +			if err := sanitizeValue(val.Index(i), fmt.Sprintf("%s[%d]", path, i)); err != nil { +				return err +			} +		} + +	case reflect.Map: +		for _, key := range val.MapKeys() { +			valItem := val.MapIndex(key) +			newItem := reflect.New(valItem.Type()).Elem() +			newItem.Set(valItem) +			if err := sanitizeValue(newItem, fmt.Sprintf("%s[%v]", path, key)); err != nil { +				return err +			} +			val.SetMapIndex(key, newItem) +		} +	} + +	return nil +} + +func latin1ToUTF8(input string) (string, error) { +	reader := charmap.ISO8859_1.NewDecoder().Reader(bytes.NewReader([]byte(input))) +	result, err := io.ReadAll(reader) +	if err != nil { +		return "", err +	} +	return string(result), nil +}  | 
