You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

519 lines
15 KiB

  1. // Copyright 2015 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package bigquery
  15. import (
  16. "encoding/json"
  17. "errors"
  18. "fmt"
  19. "reflect"
  20. "sync"
  21. bq "google.golang.org/api/bigquery/v2"
  22. )
// Schema describes the fields in a table or query result.
// It is a list of field descriptions, one *FieldSchema per field.
type Schema []*FieldSchema
// FieldSchema describes a single field.
type FieldSchema struct {
	// The field name.
	// Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
	// and must start with a letter or underscore.
	// The maximum length is 128 characters.
	Name string

	// A description of the field. The maximum length is 16,384 characters.
	Description string

	// Whether the field may contain multiple values.
	Repeated bool

	// Whether the field is required. Ignored if Repeated is true.
	Required bool

	// The field data type. If Type is Record, then this field contains a nested schema,
	// which is described by Schema.
	Type FieldType

	// Describes the nested schema if Type is set to Record.
	Schema Schema
}
  44. func (fs *FieldSchema) toBQ() *bq.TableFieldSchema {
  45. tfs := &bq.TableFieldSchema{
  46. Description: fs.Description,
  47. Name: fs.Name,
  48. Type: string(fs.Type),
  49. }
  50. if fs.Repeated {
  51. tfs.Mode = "REPEATED"
  52. } else if fs.Required {
  53. tfs.Mode = "REQUIRED"
  54. } // else leave as default, which is interpreted as NULLABLE.
  55. for _, f := range fs.Schema {
  56. tfs.Fields = append(tfs.Fields, f.toBQ())
  57. }
  58. return tfs
  59. }
  60. func (s Schema) toBQ() *bq.TableSchema {
  61. var fields []*bq.TableFieldSchema
  62. for _, f := range s {
  63. fields = append(fields, f.toBQ())
  64. }
  65. return &bq.TableSchema{Fields: fields}
  66. }
  67. func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema {
  68. fs := &FieldSchema{
  69. Description: tfs.Description,
  70. Name: tfs.Name,
  71. Repeated: tfs.Mode == "REPEATED",
  72. Required: tfs.Mode == "REQUIRED",
  73. Type: FieldType(tfs.Type),
  74. }
  75. for _, f := range tfs.Fields {
  76. fs.Schema = append(fs.Schema, bqToFieldSchema(f))
  77. }
  78. return fs
  79. }
  80. func bqToSchema(ts *bq.TableSchema) Schema {
  81. if ts == nil {
  82. return nil
  83. }
  84. var s Schema
  85. for _, f := range ts.Fields {
  86. s = append(s, bqToFieldSchema(f))
  87. }
  88. return s
  89. }
// FieldType is the type of field.
// Its value is the type name as a string (for example "STRING" or "RECORD").
type FieldType string
const (
	// StringFieldType is a string field type.
	StringFieldType FieldType = "STRING"
	// BytesFieldType is a bytes field type.
	BytesFieldType FieldType = "BYTES"
	// IntegerFieldType is an integer field type.
	IntegerFieldType FieldType = "INTEGER"
	// FloatFieldType is a float field type.
	FloatFieldType FieldType = "FLOAT"
	// BooleanFieldType is a boolean field type.
	BooleanFieldType FieldType = "BOOLEAN"
	// TimestampFieldType is a timestamp field type.
	TimestampFieldType FieldType = "TIMESTAMP"
	// RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data.
	RecordFieldType FieldType = "RECORD"
	// DateFieldType is a date field type.
	DateFieldType FieldType = "DATE"
	// TimeFieldType is a time field type.
	TimeFieldType FieldType = "TIME"
	// DateTimeFieldType is a datetime field type.
	DateTimeFieldType FieldType = "DATETIME"
	// NumericFieldType is a numeric field type. Numeric types include integer types, floating point types and the
	// NUMERIC data type.
	NumericFieldType FieldType = "NUMERIC"
	// GeographyFieldType is a geography field type. Geography types represent a set of points
	// on the Earth's surface, represented in Well Known Text (WKT) format.
	GeographyFieldType FieldType = "GEOGRAPHY"
)
var (
	// errEmptyJSONSchema is returned by SchemaFromJSON when it is given
	// zero bytes of input.
	errEmptyJSONSchema = errors.New("bigquery: empty JSON schema")

	// fieldTypes is the set of known FieldType values. convertSchemaFromJSON
	// uses it to reject type names it does not recognize.
	fieldTypes = map[FieldType]bool{
		StringFieldType:    true,
		BytesFieldType:     true,
		IntegerFieldType:   true,
		FloatFieldType:     true,
		BooleanFieldType:   true,
		TimestampFieldType: true,
		RecordFieldType:    true,
		DateFieldType:      true,
		TimeFieldType:      true,
		DateTimeFieldType:  true,
		NumericFieldType:   true,
		GeographyFieldType: true,
	}
)
// typeOfByteSlice is the reflect.Type of []byte. inferFieldSchema compares
// field types against it to map []byte to BYTES.
var typeOfByteSlice = reflect.TypeOf([]byte{})
  138. // InferSchema tries to derive a BigQuery schema from the supplied struct value.
  139. // Each exported struct field is mapped to a field in the schema.
  140. //
  141. // The following BigQuery types are inferred from the corresponding Go types.
  142. // (This is the same mapping as that used for RowIterator.Next.) Fields inferred
  143. // from these types are marked required (non-nullable).
  144. //
  145. // STRING string
  146. // BOOL bool
  147. // INTEGER int, int8, int16, int32, int64, uint8, uint16, uint32
  148. // FLOAT float32, float64
  149. // BYTES []byte
  150. // TIMESTAMP time.Time
  151. // DATE civil.Date
  152. // TIME civil.Time
  153. // DATETIME civil.DateTime
  154. // NUMERIC *big.Rat
  155. //
  156. // The big.Rat type supports numbers of arbitrary size and precision. Values
  157. // will be rounded to 9 digits after the decimal point before being transmitted
  158. // to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
  159. // for more on NUMERIC.
  160. //
  161. // A Go slice or array type is inferred to be a BigQuery repeated field of the
  162. // element type. The element type must be one of the above listed types.
  163. //
  164. // Due to lack of unique native Go type for GEOGRAPHY, there is no schema
  165. // inference to GEOGRAPHY at this time.
  166. //
  167. // Nullable fields are inferred from the NullXXX types, declared in this package:
  168. //
  169. // STRING NullString
  170. // BOOL NullBool
  171. // INTEGER NullInt64
  172. // FLOAT NullFloat64
  173. // TIMESTAMP NullTimestamp
  174. // DATE NullDate
  175. // TIME NullTime
  176. // DATETIME NullDateTime
  177. // GEOGRAPHY NullGeography
  178. //
  179. // For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
  180. // For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
  181. //
  182. // A struct field that is of struct type is inferred to be a required field of type
  183. // RECORD with a schema inferred recursively. For backwards compatibility, a field of
  184. // type pointer to struct is also inferred to be required. To get a nullable RECORD
  185. // field, use the "nullable" tag (see below).
  186. //
  187. // InferSchema returns an error if any of the examined fields is of type uint,
  188. // uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
  189. // versions may handle these cases without error.
  190. //
  191. // Recursively defined structs are also disallowed.
  192. //
  193. // Struct fields may be tagged in a way similar to the encoding/json package.
  194. // A tag of the form
  195. // bigquery:"name"
  196. // uses "name" instead of the struct field name as the BigQuery field name.
  197. // A tag of the form
  198. // bigquery:"-"
  199. // omits the field from the inferred schema.
  200. // The "nullable" option marks the field as nullable (not required). It is only
  201. // needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
  202. // fields. In this example, the Go name of the field is retained:
  203. // bigquery:",nullable"
  204. func InferSchema(st interface{}) (Schema, error) {
  205. return inferSchemaReflectCached(reflect.TypeOf(st))
  206. }
// schemaCache memoizes schema inference. inferSchemaReflectCached stores
// one cacheVal per reflect.Type key.
var schemaCache sync.Map

// cacheVal is the value stored in schemaCache: the outcome of one
// inferSchemaReflect call, whether it succeeded or failed.
type cacheVal struct {
	schema Schema // inferred schema returned by inferSchemaReflect
	err    error  // error returned by inferSchemaReflect, if any
}
  212. func inferSchemaReflectCached(t reflect.Type) (Schema, error) {
  213. var cv cacheVal
  214. v, ok := schemaCache.Load(t)
  215. if ok {
  216. cv = v.(cacheVal)
  217. } else {
  218. s, err := inferSchemaReflect(t)
  219. cv = cacheVal{s, err}
  220. schemaCache.Store(t, cv)
  221. }
  222. return cv.schema, cv.err
  223. }
  224. func inferSchemaReflect(t reflect.Type) (Schema, error) {
  225. rec, err := hasRecursiveType(t, nil)
  226. if err != nil {
  227. return nil, err
  228. }
  229. if rec {
  230. return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t)
  231. }
  232. return inferStruct(t)
  233. }
  234. func inferStruct(t reflect.Type) (Schema, error) {
  235. switch t.Kind() {
  236. case reflect.Ptr:
  237. if t.Elem().Kind() != reflect.Struct {
  238. return nil, noStructError{t}
  239. }
  240. t = t.Elem()
  241. fallthrough
  242. case reflect.Struct:
  243. return inferFields(t)
  244. default:
  245. return nil, noStructError{t}
  246. }
  247. }
  248. // inferFieldSchema infers the FieldSchema for a Go type
  249. func inferFieldSchema(fieldName string, rt reflect.Type, nullable bool) (*FieldSchema, error) {
  250. // Only []byte and struct pointers can be tagged nullable.
  251. if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
  252. return nil, badNullableError{fieldName, rt}
  253. }
  254. switch rt {
  255. case typeOfByteSlice:
  256. return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil
  257. case typeOfGoTime:
  258. return &FieldSchema{Required: true, Type: TimestampFieldType}, nil
  259. case typeOfDate:
  260. return &FieldSchema{Required: true, Type: DateFieldType}, nil
  261. case typeOfTime:
  262. return &FieldSchema{Required: true, Type: TimeFieldType}, nil
  263. case typeOfDateTime:
  264. return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil
  265. case typeOfRat:
  266. return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil
  267. }
  268. if ft := nullableFieldType(rt); ft != "" {
  269. return &FieldSchema{Required: false, Type: ft}, nil
  270. }
  271. if isSupportedIntType(rt) || isSupportedUintType(rt) {
  272. return &FieldSchema{Required: true, Type: IntegerFieldType}, nil
  273. }
  274. switch rt.Kind() {
  275. case reflect.Slice, reflect.Array:
  276. et := rt.Elem()
  277. if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) {
  278. // Multi dimensional slices/arrays are not supported by BigQuery
  279. return nil, unsupportedFieldTypeError{fieldName, rt}
  280. }
  281. if nullableFieldType(et) != "" {
  282. // Repeated nullable types are not supported by BigQuery.
  283. return nil, unsupportedFieldTypeError{fieldName, rt}
  284. }
  285. f, err := inferFieldSchema(fieldName, et, false)
  286. if err != nil {
  287. return nil, err
  288. }
  289. f.Repeated = true
  290. f.Required = false
  291. return f, nil
  292. case reflect.Ptr:
  293. if rt.Elem().Kind() != reflect.Struct {
  294. return nil, unsupportedFieldTypeError{fieldName, rt}
  295. }
  296. fallthrough
  297. case reflect.Struct:
  298. nested, err := inferStruct(rt)
  299. if err != nil {
  300. return nil, err
  301. }
  302. return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil
  303. case reflect.String:
  304. return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil
  305. case reflect.Bool:
  306. return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil
  307. case reflect.Float32, reflect.Float64:
  308. return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil
  309. default:
  310. return nil, unsupportedFieldTypeError{fieldName, rt}
  311. }
  312. }
  313. // inferFields extracts all exported field types from struct type.
  314. func inferFields(rt reflect.Type) (Schema, error) {
  315. var s Schema
  316. fields, err := fieldCache.Fields(rt)
  317. if err != nil {
  318. return nil, err
  319. }
  320. for _, field := range fields {
  321. var nullable bool
  322. for _, opt := range field.ParsedTag.([]string) {
  323. if opt == nullableTagOption {
  324. nullable = true
  325. break
  326. }
  327. }
  328. f, err := inferFieldSchema(field.Name, field.Type, nullable)
  329. if err != nil {
  330. return nil, err
  331. }
  332. f.Name = field.Name
  333. s = append(s, f)
  334. }
  335. return s, nil
  336. }
  337. // isSupportedIntType reports whether t is an int type that can be properly
  338. // represented by the BigQuery INTEGER/INT64 type.
  339. func isSupportedIntType(t reflect.Type) bool {
  340. switch t.Kind() {
  341. case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
  342. return true
  343. default:
  344. return false
  345. }
  346. }
  347. // isSupportedIntType reports whether t is a uint type that can be properly
  348. // represented by the BigQuery INTEGER/INT64 type.
  349. func isSupportedUintType(t reflect.Type) bool {
  350. switch t.Kind() {
  351. case reflect.Uint8, reflect.Uint16, reflect.Uint32:
  352. return true
  353. default:
  354. return false
  355. }
  356. }
  357. // typeList is a linked list of reflect.Types.
  358. type typeList struct {
  359. t reflect.Type
  360. next *typeList
  361. }
  362. func (l *typeList) has(t reflect.Type) bool {
  363. for l != nil {
  364. if l.t == t {
  365. return true
  366. }
  367. l = l.next
  368. }
  369. return false
  370. }
  371. // hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly,
  372. // via exported fields. (Schema inference ignores unexported fields.)
  373. func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) {
  374. for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array {
  375. t = t.Elem()
  376. }
  377. if t.Kind() != reflect.Struct {
  378. return false, nil
  379. }
  380. if seen.has(t) {
  381. return true, nil
  382. }
  383. fields, err := fieldCache.Fields(t)
  384. if err != nil {
  385. return false, err
  386. }
  387. seen = &typeList{t, seen}
  388. // Because seen is a linked list, additions to it from one field's
  389. // recursive call will not affect the value for subsequent fields' calls.
  390. for _, field := range fields {
  391. ok, err := hasRecursiveType(field.Type, seen)
  392. if err != nil {
  393. return false, err
  394. }
  395. if ok {
  396. return true, nil
  397. }
  398. }
  399. return false, nil
  400. }
// bigQueryJSONField is an individual field in a JSON BigQuery table schema definition
// (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema).
// SchemaFromJSON unmarshals its input into a slice of these; nested fields
// are carried in Fields.
type bigQueryJSONField struct {
	Description string              `json:"description"`
	Fields      []bigQueryJSONField `json:"fields"`
	Mode        string              `json:"mode"`
	Name        string              `json:"name"`
	Type        string              `json:"type"`
}
  410. // convertSchemaFromJSON generates a Schema:
  411. func convertSchemaFromJSON(fs []bigQueryJSONField) (Schema, error) {
  412. convertedSchema := Schema{}
  413. for _, f := range fs {
  414. convertedFieldSchema := &FieldSchema{
  415. Description: f.Description,
  416. Name: f.Name,
  417. Required: f.Mode == "REQUIRED",
  418. Repeated: f.Mode == "REPEATED",
  419. }
  420. if len(f.Fields) > 0 {
  421. convertedNestedFieldSchema, err := convertSchemaFromJSON(f.Fields)
  422. if err != nil {
  423. return nil, err
  424. }
  425. convertedFieldSchema.Schema = convertedNestedFieldSchema
  426. }
  427. // Check that the field-type (string) maps to a known FieldType:
  428. if _, ok := fieldTypes[FieldType(f.Type)]; !ok {
  429. return nil, fmt.Errorf("unknown field type (%v)", f.Type)
  430. }
  431. convertedFieldSchema.Type = FieldType(f.Type)
  432. convertedSchema = append(convertedSchema, convertedFieldSchema)
  433. }
  434. return convertedSchema, nil
  435. }
  436. // SchemaFromJSON takes a JSON BigQuery table schema definition
  437. // (as generated by https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema)
  438. // and returns a fully-populated Schema.
  439. func SchemaFromJSON(schemaJSON []byte) (Schema, error) {
  440. var bigQuerySchema []bigQueryJSONField
  441. // Make sure we actually have some content:
  442. if len(schemaJSON) == 0 {
  443. return nil, errEmptyJSONSchema
  444. }
  445. if err := json.Unmarshal(schemaJSON, &bigQuerySchema); err != nil {
  446. return nil, err
  447. }
  448. return convertSchemaFromJSON(bigQuerySchema)
  449. }
  450. type noStructError struct {
  451. typ reflect.Type
  452. }
  453. func (e noStructError) Error() string {
  454. return fmt.Sprintf("bigquery: can only infer schema from struct or pointer to struct, not %s", e.typ)
  455. }
  456. type badNullableError struct {
  457. name string
  458. typ reflect.Type
  459. }
  460. func (e badNullableError) Error() string {
  461. return fmt.Sprintf(`bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`, e.name, e.typ)
  462. }
  463. type unsupportedFieldTypeError struct {
  464. name string
  465. typ reflect.Type
  466. }
  467. func (e unsupportedFieldTypeError) Error() string {
  468. return fmt.Sprintf("bigquery: field %q: type %s is not supported", e.name, e.typ)
  469. }