|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- // Copyright 2017 Google LLC
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package bigquery
-
- import (
- "encoding/base64"
- "unicode/utf8"
-
- bq "google.golang.org/api/bigquery/v2"
- )
-
- // DataFormat describes the format of BigQuery table data. Its known values are the constants declared for this type.
- type DataFormat string
-
- // Constants describing the format of BigQuery table data. Each value is the string that ExternalDataConfig.toBQ sends as SourceFormat.
- const (
- CSV DataFormat = "CSV"
- Avro DataFormat = "AVRO"
- JSON DataFormat = "NEWLINE_DELIMITED_JSON"
- DatastoreBackup DataFormat = "DATASTORE_BACKUP"
- GoogleSheets DataFormat = "GOOGLE_SHEETS"
- Bigtable DataFormat = "BIGTABLE"
- Parquet DataFormat = "PARQUET"
- )
-
- // ExternalData is a table that is stored outside of BigQuery. It is implemented by
- // *ExternalDataConfig.
- // GCSReference also implements it, for backwards compatibility.
- type ExternalData interface {
- toBQ() bq.ExternalDataConfiguration // unexported, so implementations must live in this package
- }
-
- // ExternalDataConfig describes data external to BigQuery that can be used
- // in queries and to create external tables.
- type ExternalDataConfig struct {
- // The format of the data. Required.
- SourceFormat DataFormat
-
- // The fully-qualified URIs that point to your
- // data in Google Cloud. Required.
- //
- // For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
- // and it must come after the 'bucket' name. Size limits related to load jobs
- // apply to external data sources.
- //
- // For Google Cloud Bigtable URIs, exactly one URI can be specified and it has to be
- // a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
- //
- // For Google Cloud Datastore backups, exactly one URI can be specified. Also,
- // the '*' wildcard character is not allowed.
- SourceURIs []string
-
- // The schema of the data. Required for CSV and JSON; disallowed for the
- // other formats.
- Schema Schema
-
- // Try to detect schema and format options automatically.
- // Any option specified explicitly will be honored.
- AutoDetect bool
-
- // The compression type of the data.
- Compression Compression
-
- // IgnoreUnknownValues causes values not matching the schema to be
- // tolerated. Unknown values are ignored. For CSV this ignores extra values
- // at the end of a line. For JSON this ignores named values that do not
- // match any column name. If this field is not set, records containing
- // unknown values are treated as bad records. The MaxBadRecords field can
- // be used to customize how bad records are handled.
- IgnoreUnknownValues bool
-
- // MaxBadRecords is the maximum number of bad records that will be ignored
- // when reading data.
- MaxBadRecords int64
-
- // Additional options for CSV, GoogleSheets and Bigtable formats.
- Options ExternalDataConfigOptions
- }
-
- func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
- q := bq.ExternalDataConfiguration{
- SourceFormat: string(e.SourceFormat),
- SourceUris: e.SourceURIs,
- Autodetect: e.AutoDetect,
- Compression: string(e.Compression),
- IgnoreUnknownValues: e.IgnoreUnknownValues,
- MaxBadRecords: e.MaxBadRecords,
- }
- if e.Schema != nil {
- q.Schema = e.Schema.toBQ()
- }
- if e.Options != nil {
- e.Options.populateExternalDataConfig(&q)
- }
- return q
- }
-
- func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
- e := &ExternalDataConfig{
- SourceFormat: DataFormat(q.SourceFormat),
- SourceURIs: q.SourceUris,
- AutoDetect: q.Autodetect,
- Compression: Compression(q.Compression),
- IgnoreUnknownValues: q.IgnoreUnknownValues,
- MaxBadRecords: q.MaxBadRecords,
- Schema: bqToSchema(q.Schema),
- }
- switch {
- case q.CsvOptions != nil:
- e.Options = bqToCSVOptions(q.CsvOptions)
- case q.GoogleSheetsOptions != nil:
- e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
- case q.BigtableOptions != nil:
- var err error
- e.Options, err = bqToBigtableOptions(q.BigtableOptions)
- if err != nil {
- return nil, err
- }
- }
- return e, nil
- }
-
- // ExternalDataConfigOptions are additional options for external data configurations.
- // This interface is implemented by *CSVOptions, *GoogleSheetsOptions and *BigtableOptions (the methods have pointer receivers).
- type ExternalDataConfigOptions interface {
- populateExternalDataConfig(*bq.ExternalDataConfiguration) // writes the format-specific options into the given configuration
- }
-
- // CSVOptions are additional options for CSV external data sources.
- type CSVOptions struct {
- // AllowJaggedRows causes missing trailing optional columns to be tolerated
- // when reading CSV data. Missing values are treated as nulls.
- AllowJaggedRows bool
-
- // AllowQuotedNewlines sets whether quoted data sections containing
- // newlines are allowed when reading CSV data.
- AllowQuotedNewlines bool
-
- // Encoding is the character encoding of data to be read.
- Encoding Encoding
-
- // FieldDelimiter is the separator for fields in a CSV file, used when
- // reading or exporting data. The default is ",".
- FieldDelimiter string
-
- // Quote is the value used to quote data sections in a CSV file. The
- // default quotation character is the double quote ("), which is used if
- // both Quote and ForceZeroQuote are unset.
- // To specify that no character should be interpreted as a quotation
- // character, set ForceZeroQuote to true.
- // Only used when reading data.
- Quote string
- ForceZeroQuote bool // when true, an empty quote string is sent and Quote is ignored
-
- // The number of rows at the top of a CSV file that BigQuery will skip when
- // reading data.
- SkipLeadingRows int64
- }
-
- // populateExternalDataConfig stores the settings of o on c as a freshly
- // allocated bq.CsvOptions, replacing whatever c.CsvOptions held before.
- func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
- 	csv := new(bq.CsvOptions)
- 	csv.AllowJaggedRows = o.AllowJaggedRows
- 	csv.AllowQuotedNewlines = o.AllowQuotedNewlines
- 	csv.Encoding = string(o.Encoding)
- 	csv.FieldDelimiter = o.FieldDelimiter
- 	// quote() yields nil when no quote character has been configured.
- 	csv.Quote = o.quote()
- 	csv.SkipLeadingRows = o.SkipLeadingRows
- 	c.CsvOptions = csv
- }
-
- // quote returns the CSV quote character, or nil if unset.
- func (o *CSVOptions) quote() *string {
- if o.ForceZeroQuote {
- quote := ""
- return "e
- }
- if o.Quote == "" {
- return nil
- }
- return &o.Quote
- }
-
- // setQuote records the quote setting pointed to by ps. A nil ps leaves o
- // untouched; a pointer to the empty string additionally sets ForceZeroQuote,
- // so that the explicit "no quote character" choice is preserved.
- func (o *CSVOptions) setQuote(ps *string) {
- 	if ps == nil {
- 		return
- 	}
- 	o.Quote = *ps
- 	if *ps == "" {
- 		o.ForceZeroQuote = true
- 	}
- }
-
- func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
- o := &CSVOptions{
- AllowJaggedRows: q.AllowJaggedRows,
- AllowQuotedNewlines: q.AllowQuotedNewlines,
- Encoding: Encoding(q.Encoding),
- FieldDelimiter: q.FieldDelimiter,
- SkipLeadingRows: q.SkipLeadingRows,
- }
- o.setQuote(q.Quote)
- return o
- }
-
- // GoogleSheetsOptions are additional options for GoogleSheets external data sources. *GoogleSheetsOptions implements ExternalDataConfigOptions.
- type GoogleSheetsOptions struct {
- // The number of rows at the top of a sheet that BigQuery will skip when
- // reading data.
- SkipLeadingRows int64
- }
-
- // populateExternalDataConfig stores the settings of o on c as a freshly
- // allocated bq.GoogleSheetsOptions.
- func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
- 	sheets := new(bq.GoogleSheetsOptions)
- 	sheets.SkipLeadingRows = o.SkipLeadingRows
- 	c.GoogleSheetsOptions = sheets
- }
-
- // bqToGoogleSheetsOptions converts q into a *GoogleSheetsOptions.
- func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
- 	opts := new(GoogleSheetsOptions)
- 	opts.SkipLeadingRows = q.SkipLeadingRows
- 	return opts
- }
-
- // BigtableOptions are additional options for Bigtable external data sources.
- type BigtableOptions struct {
- // A list of column families to expose in the table schema along with their
- // types. If omitted, all column families are present in the table schema and
- // their values are read as BYTES.
- ColumnFamilies []*BigtableColumnFamily
-
- // If true, then the column families that are not specified in the ColumnFamilies
- // list are not exposed in the table schema. Otherwise, they are read with BYTES
- // type values. The default is false.
- IgnoreUnspecifiedColumnFamilies bool
-
- // If true, then the rowkey column families will be read and converted to string.
- // Otherwise they are read with BYTES type values and users need to manually cast
- // them with CAST if necessary. The default is false.
- ReadRowkeyAsString bool
- }
-
- // populateExternalDataConfig stores the settings of o on c as a freshly
- // allocated bq.BigtableOptions, converting every column family in order.
- func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
- 	bt := new(bq.BigtableOptions)
- 	bt.IgnoreUnspecifiedColumnFamilies = o.IgnoreUnspecifiedColumnFamilies
- 	bt.ReadRowkeyAsString = o.ReadRowkeyAsString
- 	for i := range o.ColumnFamilies {
- 		bt.ColumnFamilies = append(bt.ColumnFamilies, o.ColumnFamilies[i].toBQ())
- 	}
- 	c.BigtableOptions = bt
- }
-
- // bqToBigtableOptions converts q into a *BigtableOptions. It stops at the
- // first column family that fails to convert and returns that error.
- func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
- 	opts := new(BigtableOptions)
- 	opts.IgnoreUnspecifiedColumnFamilies = q.IgnoreUnspecifiedColumnFamilies
- 	opts.ReadRowkeyAsString = q.ReadRowkeyAsString
- 	for _, src := range q.ColumnFamilies {
- 		fam, err := bqToBigtableColumnFamily(src)
- 		if err != nil {
- 			return nil, err
- 		}
- 		opts.ColumnFamilies = append(opts.ColumnFamilies, fam)
- 	}
- 	return opts, nil
- }
-
- // BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
- type BigtableColumnFamily struct {
- // Identifier of the column family.
- FamilyID string
-
- // Lists of columns that should be exposed as individual fields as opposed to a
- // list of (column name, value) pairs. All columns whose qualifier matches a
- // qualifier in this list can be accessed as [family field name].[column field name]. Other columns can be accessed as
- // a list through the [family field name].Column field.
- Columns []*BigtableColumn
-
- // The encoding of the values when the type is not STRING. Acceptable encoding values are:
- // - TEXT - indicates values are alphanumeric text strings.
- // - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions.
- // This can be overridden for a specific column by listing that column in 'columns' and
- // specifying an encoding for it.
- Encoding string
-
- // If true, only the latest version of values are exposed for all columns in this
- // column family. This can be overridden for a specific column by listing that
- // column in 'columns' and specifying a different setting for that column.
- OnlyReadLatest bool
-
- // The type to convert the value in cells of this
- // column family. The values are expected to be encoded using HBase
- // Bytes.toBytes function when using the BINARY encoding value.
- // Following BigQuery types are allowed (case-sensitive):
- // BYTES STRING INTEGER FLOAT BOOLEAN.
- // The default type is BYTES. This can be overridden for a specific column by
- // listing that column in 'columns' and specifying a type for it.
- Type string
- }
-
- // toBQ builds the *bq.BigtableColumnFamily equivalent of b, converting each
- // of its columns in order.
- func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
- 	fam := new(bq.BigtableColumnFamily)
- 	fam.FamilyId = b.FamilyID
- 	fam.Encoding = b.Encoding
- 	fam.OnlyReadLatest = b.OnlyReadLatest
- 	fam.Type = b.Type
- 	for i := range b.Columns {
- 		fam.Columns = append(fam.Columns, b.Columns[i].toBQ())
- 	}
- 	return fam
- }
-
- // bqToBigtableColumnFamily converts q into a *BigtableColumnFamily. It stops
- // at the first column that fails to convert and returns that error.
- func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
- 	fam := new(BigtableColumnFamily)
- 	fam.FamilyID = q.FamilyId
- 	fam.Encoding = q.Encoding
- 	fam.OnlyReadLatest = q.OnlyReadLatest
- 	fam.Type = q.Type
- 	for _, src := range q.Columns {
- 		converted, err := bqToBigtableColumn(src)
- 		if err != nil {
- 			return nil, err
- 		}
- 		fam.Columns = append(fam.Columns, converted)
- 	}
- 	return fam, nil
- }
-
- // BigtableColumn describes how BigQuery should access a Bigtable column.
- type BigtableColumn struct {
- // Qualifier of the column. Columns in the parent column family that have this
- // exact qualifier are exposed as the [family field name].[column field name] field. The column field name is the
- // same as the column qualifier. toBQ sends it as QualifierString when it is valid UTF-8, and base64-encoded as QualifierEncoded otherwise.
- Qualifier string
-
- // If the qualifier is not a valid BigQuery field identifier i.e. does not match
- // [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
- // name and is used as field name in queries.
- FieldName string
-
- // If true, only the latest version of values are exposed for this column.
- // See BigtableColumnFamily.OnlyReadLatest.
- OnlyReadLatest bool
-
- // The encoding of the values when the type is not STRING.
- // See BigtableColumnFamily.Encoding
- Encoding string
-
- // The type to convert the value in cells of this column.
- // See BigtableColumnFamily.Type
- Type string
- }
-
- // toBQ builds the *bq.BigtableColumn equivalent of b. A qualifier that is
- // valid UTF-8 is sent verbatim as QualifierString; any other qualifier is sent
- // as unpadded standard base64 in QualifierEncoded.
- func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
- 	col := new(bq.BigtableColumn)
- 	col.FieldName = b.FieldName
- 	col.OnlyReadLatest = b.OnlyReadLatest
- 	col.Encoding = b.Encoding
- 	col.Type = b.Type
-
- 	if !utf8.ValidString(b.Qualifier) {
- 		col.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
- 		return col
- 	}
- 	col.QualifierString = b.Qualifier
- 	return col
- }
-
- // bqToBigtableColumn converts q into a *BigtableColumn. A non-empty
- // QualifierString is used as the qualifier directly; otherwise
- // QualifierEncoded is decoded as unpadded standard base64, and a decoding
- // failure is returned as the error.
- func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
- 	col := new(BigtableColumn)
- 	col.FieldName = q.FieldName
- 	col.OnlyReadLatest = q.OnlyReadLatest
- 	col.Encoding = q.Encoding
- 	col.Type = q.Type
-
- 	if plain := q.QualifierString; plain != "" {
- 		col.Qualifier = plain
- 		return col, nil
- 	}
- 	raw, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
- 	if err != nil {
- 		return nil, err
- 	}
- 	col.Qualifier = string(raw)
- 	return col, nil
- }
|