// Copyright 2015 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bigquery import ( "encoding/json" "errors" "fmt" "reflect" "sync" bq "google.golang.org/api/bigquery/v2" ) // Schema describes the fields in a table or query result. type Schema []*FieldSchema // FieldSchema describes a single field. type FieldSchema struct { // The field name. // Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_), // and must start with a letter or underscore. // The maximum length is 128 characters. Name string // A description of the field. The maximum length is 16,384 characters. Description string // Whether the field may contain multiple values. Repeated bool // Whether the field is required. Ignored if Repeated is true. Required bool // The field data type. If Type is Record, then this field contains a nested schema, // which is described by Schema. Type FieldType // Describes the nested schema if Type is set to Record. Schema Schema } func (fs *FieldSchema) toBQ() *bq.TableFieldSchema { tfs := &bq.TableFieldSchema{ Description: fs.Description, Name: fs.Name, Type: string(fs.Type), } if fs.Repeated { tfs.Mode = "REPEATED" } else if fs.Required { tfs.Mode = "REQUIRED" } // else leave as default, which is interpreted as NULLABLE. 
for _, f := range fs.Schema { tfs.Fields = append(tfs.Fields, f.toBQ()) } return tfs } func (s Schema) toBQ() *bq.TableSchema { var fields []*bq.TableFieldSchema for _, f := range s { fields = append(fields, f.toBQ()) } return &bq.TableSchema{Fields: fields} } func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema { fs := &FieldSchema{ Description: tfs.Description, Name: tfs.Name, Repeated: tfs.Mode == "REPEATED", Required: tfs.Mode == "REQUIRED", Type: FieldType(tfs.Type), } for _, f := range tfs.Fields { fs.Schema = append(fs.Schema, bqToFieldSchema(f)) } return fs } func bqToSchema(ts *bq.TableSchema) Schema { if ts == nil { return nil } var s Schema for _, f := range ts.Fields { s = append(s, bqToFieldSchema(f)) } return s } // FieldType is the type of field. type FieldType string const ( // StringFieldType is a string field type. StringFieldType FieldType = "STRING" // BytesFieldType is a bytes field type. BytesFieldType FieldType = "BYTES" // IntegerFieldType is a integer field type. IntegerFieldType FieldType = "INTEGER" // FloatFieldType is a float field type. FloatFieldType FieldType = "FLOAT" // BooleanFieldType is a boolean field type. BooleanFieldType FieldType = "BOOLEAN" // TimestampFieldType is a timestamp field type. TimestampFieldType FieldType = "TIMESTAMP" // RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data. RecordFieldType FieldType = "RECORD" // DateFieldType is a date field type. DateFieldType FieldType = "DATE" // TimeFieldType is a time field type. TimeFieldType FieldType = "TIME" // DateTimeFieldType is a datetime field type. DateTimeFieldType FieldType = "DATETIME" // NumericFieldType is a numeric field type. Numeric types include integer types, floating point types and the // NUMERIC data type. NumericFieldType FieldType = "NUMERIC" // GeographyFieldType is a string field type. 
Geography types represent a set of points // on the Earth's surface, represented in Well Known Text (WKT) format. GeographyFieldType FieldType = "GEOGRAPHY" ) var ( errEmptyJSONSchema = errors.New("bigquery: empty JSON schema") fieldTypes = map[FieldType]bool{ StringFieldType: true, BytesFieldType: true, IntegerFieldType: true, FloatFieldType: true, BooleanFieldType: true, TimestampFieldType: true, RecordFieldType: true, DateFieldType: true, TimeFieldType: true, DateTimeFieldType: true, NumericFieldType: true, GeographyFieldType: true, } ) var typeOfByteSlice = reflect.TypeOf([]byte{}) // InferSchema tries to derive a BigQuery schema from the supplied struct value. // Each exported struct field is mapped to a field in the schema. // // The following BigQuery types are inferred from the corresponding Go types. // (This is the same mapping as that used for RowIterator.Next.) Fields inferred // from these types are marked required (non-nullable). // // STRING string // BOOL bool // INTEGER int, int8, int16, int32, int64, uint8, uint16, uint32 // FLOAT float32, float64 // BYTES []byte // TIMESTAMP time.Time // DATE civil.Date // TIME civil.Time // DATETIME civil.DateTime // NUMERIC *big.Rat // // The big.Rat type supports numbers of arbitrary size and precision. Values // will be rounded to 9 digits after the decimal point before being transmitted // to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type // for more on NUMERIC. // // A Go slice or array type is inferred to be a BigQuery repeated field of the // element type. The element type must be one of the above listed types. // // Due to lack of unique native Go type for GEOGRAPHY, there is no schema // inference to GEOGRAPHY at this time. 
//
// Nullable fields are inferred from the NullXXX types, declared in this package:
//
//   STRING      NullString
//   BOOL        NullBool
//   INTEGER     NullInt64
//   FLOAT       NullFloat64
//   TIMESTAMP   NullTimestamp
//   DATE        NullDate
//   TIME        NullTime
//   DATETIME    NullDateTime
//   GEOGRAPHY   NullGeography
//
// For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
// For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
//
// A struct field that is of struct type is inferred to be a required field of type
// RECORD with a schema inferred recursively. For backwards compatibility, a field of
// type pointer to struct is also inferred to be required. To get a nullable RECORD
// field, use the "nullable" tag (see below).
//
// InferSchema returns an error if any of the examined fields is of type uint,
// uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
// versions may handle these cases without error.
//
// Recursively defined structs are also disallowed.
//
// Struct fields may be tagged in a way similar to the encoding/json package.
// A tag of the form
//     bigquery:"name"
// uses "name" instead of the struct field name as the BigQuery field name.
// A tag of the form
//     bigquery:"-"
// omits the field from the inferred schema.
// The "nullable" option marks the field as nullable (not required). It is only
// needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
// fields.
In this example, the Go name of the field is retained: // bigquery:",nullable" func InferSchema(st interface{}) (Schema, error) { return inferSchemaReflectCached(reflect.TypeOf(st)) } var schemaCache sync.Map type cacheVal struct { schema Schema err error } func inferSchemaReflectCached(t reflect.Type) (Schema, error) { var cv cacheVal v, ok := schemaCache.Load(t) if ok { cv = v.(cacheVal) } else { s, err := inferSchemaReflect(t) cv = cacheVal{s, err} schemaCache.Store(t, cv) } return cv.schema, cv.err } func inferSchemaReflect(t reflect.Type) (Schema, error) { rec, err := hasRecursiveType(t, nil) if err != nil { return nil, err } if rec { return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t) } return inferStruct(t) } func inferStruct(t reflect.Type) (Schema, error) { switch t.Kind() { case reflect.Ptr: if t.Elem().Kind() != reflect.Struct { return nil, noStructError{t} } t = t.Elem() fallthrough case reflect.Struct: return inferFields(t) default: return nil, noStructError{t} } } // inferFieldSchema infers the FieldSchema for a Go type func inferFieldSchema(fieldName string, rt reflect.Type, nullable bool) (*FieldSchema, error) { // Only []byte and struct pointers can be tagged nullable. 
if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) { return nil, badNullableError{fieldName, rt} } switch rt { case typeOfByteSlice: return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil case typeOfGoTime: return &FieldSchema{Required: true, Type: TimestampFieldType}, nil case typeOfDate: return &FieldSchema{Required: true, Type: DateFieldType}, nil case typeOfTime: return &FieldSchema{Required: true, Type: TimeFieldType}, nil case typeOfDateTime: return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil case typeOfRat: return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil } if ft := nullableFieldType(rt); ft != "" { return &FieldSchema{Required: false, Type: ft}, nil } if isSupportedIntType(rt) || isSupportedUintType(rt) { return &FieldSchema{Required: true, Type: IntegerFieldType}, nil } switch rt.Kind() { case reflect.Slice, reflect.Array: et := rt.Elem() if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) { // Multi dimensional slices/arrays are not supported by BigQuery return nil, unsupportedFieldTypeError{fieldName, rt} } if nullableFieldType(et) != "" { // Repeated nullable types are not supported by BigQuery. 
return nil, unsupportedFieldTypeError{fieldName, rt} } f, err := inferFieldSchema(fieldName, et, false) if err != nil { return nil, err } f.Repeated = true f.Required = false return f, nil case reflect.Ptr: if rt.Elem().Kind() != reflect.Struct { return nil, unsupportedFieldTypeError{fieldName, rt} } fallthrough case reflect.Struct: nested, err := inferStruct(rt) if err != nil { return nil, err } return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil case reflect.String: return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil case reflect.Bool: return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil case reflect.Float32, reflect.Float64: return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil default: return nil, unsupportedFieldTypeError{fieldName, rt} } } // inferFields extracts all exported field types from struct type. func inferFields(rt reflect.Type) (Schema, error) { var s Schema fields, err := fieldCache.Fields(rt) if err != nil { return nil, err } for _, field := range fields { var nullable bool for _, opt := range field.ParsedTag.([]string) { if opt == nullableTagOption { nullable = true break } } f, err := inferFieldSchema(field.Name, field.Type, nullable) if err != nil { return nil, err } f.Name = field.Name s = append(s, f) } return s, nil } // isSupportedIntType reports whether t is an int type that can be properly // represented by the BigQuery INTEGER/INT64 type. func isSupportedIntType(t reflect.Type) bool { switch t.Kind() { case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: return true default: return false } } // isSupportedIntType reports whether t is a uint type that can be properly // represented by the BigQuery INTEGER/INT64 type. func isSupportedUintType(t reflect.Type) bool { switch t.Kind() { case reflect.Uint8, reflect.Uint16, reflect.Uint32: return true default: return false } } // typeList is a linked list of reflect.Types. 
type typeList struct { t reflect.Type next *typeList } func (l *typeList) has(t reflect.Type) bool { for l != nil { if l.t == t { return true } l = l.next } return false } // hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly, // via exported fields. (Schema inference ignores unexported fields.) func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) { for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array { t = t.Elem() } if t.Kind() != reflect.Struct { return false, nil } if seen.has(t) { return true, nil } fields, err := fieldCache.Fields(t) if err != nil { return false, err } seen = &typeList{t, seen} // Because seen is a linked list, additions to it from one field's // recursive call will not affect the value for subsequent fields' calls. for _, field := range fields { ok, err := hasRecursiveType(field.Type, seen) if err != nil { return false, err } if ok { return true, nil } } return false, nil } // bigQuerySchemaJSONField is an individual field in a JSON BigQuery table schema definition // (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema). 
type bigQueryJSONField struct { Description string `json:"description"` Fields []bigQueryJSONField `json:"fields"` Mode string `json:"mode"` Name string `json:"name"` Type string `json:"type"` } // convertSchemaFromJSON generates a Schema: func convertSchemaFromJSON(fs []bigQueryJSONField) (Schema, error) { convertedSchema := Schema{} for _, f := range fs { convertedFieldSchema := &FieldSchema{ Description: f.Description, Name: f.Name, Required: f.Mode == "REQUIRED", Repeated: f.Mode == "REPEATED", } if len(f.Fields) > 0 { convertedNestedFieldSchema, err := convertSchemaFromJSON(f.Fields) if err != nil { return nil, err } convertedFieldSchema.Schema = convertedNestedFieldSchema } // Check that the field-type (string) maps to a known FieldType: if _, ok := fieldTypes[FieldType(f.Type)]; !ok { return nil, fmt.Errorf("unknown field type (%v)", f.Type) } convertedFieldSchema.Type = FieldType(f.Type) convertedSchema = append(convertedSchema, convertedFieldSchema) } return convertedSchema, nil } // SchemaFromJSON takes a JSON BigQuery table schema definition // (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema) // and returns a fully-populated Schema. 
func SchemaFromJSON(schemaJSON []byte) (Schema, error) { var bigQuerySchema []bigQueryJSONField // Make sure we actually have some content: if len(schemaJSON) == 0 { return nil, errEmptyJSONSchema } if err := json.Unmarshal(schemaJSON, &bigQuerySchema); err != nil { return nil, err } return convertSchemaFromJSON(bigQuerySchema) } type noStructError struct { typ reflect.Type } func (e noStructError) Error() string { return fmt.Sprintf("bigquery: can only infer schema from struct or pointer to struct, not %s", e.typ) } type badNullableError struct { name string typ reflect.Type } func (e badNullableError) Error() string { return fmt.Sprintf(`bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`, e.name, e.typ) } type unsupportedFieldTypeError struct { name string typ reflect.Type } func (e unsupportedFieldTypeError) Error() string { return fmt.Sprintf("bigquery: field %q: type %s is not supported", e.name, e.typ) }