// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bigquery

import (
	"encoding/base64"
	"unicode/utf8"

	bq "google.golang.org/api/bigquery/v2"
)

// DataFormat describes the format of BigQuery table data.
type DataFormat string

// Constants describing the format of BigQuery table data.
const (
	CSV             DataFormat = "CSV"
	Avro            DataFormat = "AVRO"
	JSON            DataFormat = "NEWLINE_DELIMITED_JSON"
	DatastoreBackup DataFormat = "DATASTORE_BACKUP"
	GoogleSheets    DataFormat = "GOOGLE_SHEETS"
	Bigtable        DataFormat = "BIGTABLE"
	Parquet         DataFormat = "PARQUET"
)

// ExternalData is a table which is stored outside of BigQuery. It is implemented by
// *ExternalDataConfig.
// GCSReference also implements it, for backwards compatibility.
type ExternalData interface {
	toBQ() bq.ExternalDataConfiguration
}

// ExternalDataConfig describes data external to BigQuery that can be used
// in queries and to create external tables.
type ExternalDataConfig struct {
	// The format of the data. Required.
	SourceFormat DataFormat

	// The fully-qualified URIs that point to your
	// data in Google Cloud. Required.
	//
	// For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
	// and it must come after the 'bucket' name. Size limits related to load jobs
	// apply to external data sources.
	//
	// For Google Cloud Bigtable URIs, exactly one URI can be specified and it has to be
	// a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
	//
	// For Google Cloud Datastore backups, exactly one URI can be specified. Also,
	// the '*' wildcard character is not allowed.
	SourceURIs []string

	// The schema of the data. Required for CSV and JSON; disallowed for the
	// other formats.
	Schema Schema

	// Try to detect schema and format options automatically.
	// Any option specified explicitly will be honored.
	AutoDetect bool

	// The compression type of the data.
	Compression Compression

	// IgnoreUnknownValues causes values not matching the schema to be
	// tolerated. Unknown values are ignored. For CSV this ignores extra values
	// at the end of a line. For JSON this ignores named values that do not
	// match any column name. If this field is not set, records containing
	// unknown values are treated as bad records. The MaxBadRecords field can
	// be used to customize how bad records are handled.
	IgnoreUnknownValues bool

	// MaxBadRecords is the maximum number of bad records that will be ignored
	// when reading data.
	MaxBadRecords int64

	// Additional options for CSV, GoogleSheets and Bigtable formats.
	Options ExternalDataConfigOptions
}

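// The sketch below shows how an ExternalDataConfig might be assembled for CSV
// files in Cloud Storage. It is illustrative only: client, ctx, TableMetadata,
// and Table.Create come from elsewhere in the package (or the caller) and are
// assumptions of this sketch, not definitions in this file.
//
//	edc := &ExternalDataConfig{
//		SourceFormat: CSV,
//		SourceURIs:   []string{"gs://my-bucket/data/*.csv"},
//		Schema: Schema{
//			{Name: "name", Type: StringFieldType},
//			{Name: "age", Type: IntegerFieldType},
//		},
//		Options: &CSVOptions{SkipLeadingRows: 1},
//	}
//	table := client.Dataset("my_dataset").Table("names")
//	err := table.Create(ctx, &TableMetadata{ExternalDataConfig: edc})
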
func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
	q := bq.ExternalDataConfiguration{
		SourceFormat:        string(e.SourceFormat),
		SourceUris:          e.SourceURIs,
		Autodetect:          e.AutoDetect,
		Compression:         string(e.Compression),
		IgnoreUnknownValues: e.IgnoreUnknownValues,
		MaxBadRecords:       e.MaxBadRecords,
	}
	if e.Schema != nil {
		q.Schema = e.Schema.toBQ()
	}
	if e.Options != nil {
		e.Options.populateExternalDataConfig(&q)
	}
	return q
}

func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
	e := &ExternalDataConfig{
		SourceFormat:        DataFormat(q.SourceFormat),
		SourceURIs:          q.SourceUris,
		AutoDetect:          q.Autodetect,
		Compression:         Compression(q.Compression),
		IgnoreUnknownValues: q.IgnoreUnknownValues,
		MaxBadRecords:       q.MaxBadRecords,
		Schema:              bqToSchema(q.Schema),
	}
	switch {
	case q.CsvOptions != nil:
		e.Options = bqToCSVOptions(q.CsvOptions)
	case q.GoogleSheetsOptions != nil:
		e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
	case q.BigtableOptions != nil:
		var err error
		e.Options, err = bqToBigtableOptions(q.BigtableOptions)
		if err != nil {
			return nil, err
		}
	}
	return e, nil
}

// ExternalDataConfigOptions are additional options for external data configurations.
// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions.
type ExternalDataConfigOptions interface {
	populateExternalDataConfig(*bq.ExternalDataConfiguration)
}

// CSVOptions are additional options for CSV external data sources.
type CSVOptions struct {
	// AllowJaggedRows causes missing trailing optional columns to be tolerated
	// when reading CSV data. Missing values are treated as nulls.
	AllowJaggedRows bool

	// AllowQuotedNewlines sets whether quoted data sections containing
	// newlines are allowed when reading CSV data.
	AllowQuotedNewlines bool

	// Encoding is the character encoding of data to be read.
	Encoding Encoding

	// FieldDelimiter is the separator for fields in a CSV file, used when
	// reading or exporting data. The default is ",".
	FieldDelimiter string

	// Quote is the value used to quote data sections in a CSV file. The
	// default quotation character is the double quote ("), which is used if
	// both Quote and ForceZeroQuote are unset.
	// To specify that no character should be interpreted as a quotation
	// character, set ForceZeroQuote to true.
	// Only used when reading data.
	Quote          string
	ForceZeroQuote bool

	// The number of rows at the top of a CSV file that BigQuery will skip when
	// reading data.
	SkipLeadingRows int64
}

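// The Quote and ForceZeroQuote fields together select one of three behaviors,
// sketched below for illustration (the values shown are assumptions, not
// required settings):
//
//	// Default: data sections are quoted with the double quote (").
//	o1 := &CSVOptions{}
//
//	// Use a custom quotation character.
//	o2 := &CSVOptions{Quote: "'"}
//
//	// Treat no character as a quotation character.
//	o3 := &CSVOptions{ForceZeroQuote: true}
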
func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	c.CsvOptions = &bq.CsvOptions{
		AllowJaggedRows:     o.AllowJaggedRows,
		AllowQuotedNewlines: o.AllowQuotedNewlines,
		Encoding:            string(o.Encoding),
		FieldDelimiter:      o.FieldDelimiter,
		Quote:               o.quote(),
		SkipLeadingRows:     o.SkipLeadingRows,
	}
}

// quote returns the CSV quote character, or nil if unset.
func (o *CSVOptions) quote() *string {
	if o.ForceZeroQuote {
		quote := ""
		return &quote
	}
	if o.Quote == "" {
		return nil
	}
	return &o.Quote
}

func (o *CSVOptions) setQuote(ps *string) {
	if ps != nil {
		o.Quote = *ps
		if o.Quote == "" {
			o.ForceZeroQuote = true
		}
	}
}

func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
	o := &CSVOptions{
		AllowJaggedRows:     q.AllowJaggedRows,
		AllowQuotedNewlines: q.AllowQuotedNewlines,
		Encoding:            Encoding(q.Encoding),
		FieldDelimiter:      q.FieldDelimiter,
		SkipLeadingRows:     q.SkipLeadingRows,
	}
	o.setQuote(q.Quote)
	return o
}

// GoogleSheetsOptions are additional options for GoogleSheets external data sources.
type GoogleSheetsOptions struct {
	// The number of rows at the top of a sheet that BigQuery will skip when
	// reading data.
	SkipLeadingRows int64
}

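// A minimal sketch of how these options might be attached to an external data
// configuration for a sheet with a single header row (the SourceURIs value is
// an illustrative placeholder):
//
//	edc := &ExternalDataConfig{
//		SourceFormat: GoogleSheets,
//		SourceURIs:   []string{"https://docs.google.com/spreadsheets/d/..."},
//		AutoDetect:   true,
//		Options:      &GoogleSheetsOptions{SkipLeadingRows: 1},
//	}
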
func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{
		SkipLeadingRows: o.SkipLeadingRows,
	}
}

func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
	return &GoogleSheetsOptions{
		SkipLeadingRows: q.SkipLeadingRows,
	}
}

// BigtableOptions are additional options for Bigtable external data sources.
type BigtableOptions struct {
	// A list of column families to expose in the table schema along with their
	// types. If omitted, all column families are present in the table schema and
	// their values are read as BYTES.
	ColumnFamilies []*BigtableColumnFamily

	// If true, then column families that are not specified in the ColumnFamilies
	// list are not exposed in the table schema. Otherwise, they are read with BYTES
	// type values. The default is false.
	IgnoreUnspecifiedColumnFamilies bool

	// If true, then the rowkey column families will be read and converted to string.
	// Otherwise they are read with BYTES type values and users need to manually cast
	// them with CAST if necessary. The default is false.
	ReadRowkeyAsString bool
}

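// An illustrative sketch (family, column and type values are assumptions):
// expose only the "cf1" family, read row keys as strings, and read its "temp"
// column as FLOAT:
//
//	opts := &BigtableOptions{
//		IgnoreUnspecifiedColumnFamilies: true,
//		ReadRowkeyAsString:              true,
//		ColumnFamilies: []*BigtableColumnFamily{
//			{
//				FamilyID: "cf1",
//				Encoding: "BINARY",
//				Columns: []*BigtableColumn{
//					{Qualifier: "temp", Type: "FLOAT"},
//				},
//			},
//		},
//	}
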
func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
	q := &bq.BigtableOptions{
		IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies,
		ReadRowkeyAsString:              o.ReadRowkeyAsString,
	}
	for _, f := range o.ColumnFamilies {
		q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ())
	}
	c.BigtableOptions = q
}

func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
	b := &BigtableOptions{
		IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies,
		ReadRowkeyAsString:              q.ReadRowkeyAsString,
	}
	for _, f := range q.ColumnFamilies {
		f2, err := bqToBigtableColumnFamily(f)
		if err != nil {
			return nil, err
		}
		b.ColumnFamilies = append(b.ColumnFamilies, f2)
	}
	return b, nil
}

// BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
type BigtableColumnFamily struct {
	// Identifier of the column family.
	FamilyID string

	// Lists of columns that should be exposed as individual fields as opposed to a
	// list of (column name, value) pairs. All columns whose qualifier matches a
	// qualifier in this list can be accessed as individual fields; other columns
	// can be accessed as a list through the Column field.
	Columns []*BigtableColumn

	// The encoding of the values when the type is not STRING. Acceptable encoding values are:
	// - TEXT - indicates values are alphanumeric text strings.
	// - BINARY - indicates values are encoded using the HBase Bytes.toBytes family of functions.
	// This can be overridden for a specific column by listing that column in 'columns' and
	// specifying an encoding for it.
	Encoding string

	// If true, only the latest version of values are exposed for all columns in this
	// column family. This can be overridden for a specific column by listing that
	// column in 'columns' and specifying a different setting for that column.
	OnlyReadLatest bool

	// The type to convert the value in cells of this
	// column family. The values are expected to be encoded using the HBase
	// Bytes.toBytes function when using the BINARY encoding value.
	// The following BigQuery types are allowed (case-sensitive):
	// BYTES, STRING, INTEGER, FLOAT, BOOLEAN.
	// The default type is BYTES. This can be overridden for a specific column by
	// listing that column in 'columns' and specifying a type for it.
	Type string
}

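// A minimal sketch (names and types are assumptions) of the per-column override
// behavior described above: values in the family default to INTEGER, while one
// column is read as its latest STRING value only:
//
//	fam := &BigtableColumnFamily{
//		FamilyID: "stats",
//		Type:     "INTEGER",
//		Columns: []*BigtableColumn{
//			{Qualifier: "note", Type: "STRING", OnlyReadLatest: true},
//		},
//	}
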
func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
	q := &bq.BigtableColumnFamily{
		FamilyId:       b.FamilyID,
		Encoding:       b.Encoding,
		OnlyReadLatest: b.OnlyReadLatest,
		Type:           b.Type,
	}
	for _, col := range b.Columns {
		q.Columns = append(q.Columns, col.toBQ())
	}
	return q
}

func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
	b := &BigtableColumnFamily{
		FamilyID:       q.FamilyId,
		Encoding:       q.Encoding,
		OnlyReadLatest: q.OnlyReadLatest,
		Type:           q.Type,
	}
	for _, col := range q.Columns {
		c, err := bqToBigtableColumn(col)
		if err != nil {
			return nil, err
		}
		b.Columns = append(b.Columns, c)
	}
	return b, nil
}

// BigtableColumn describes how BigQuery should access a Bigtable column.
type BigtableColumn struct {
	// Qualifier of the column. Columns in the parent column family that have this
	// exact qualifier are exposed as a field. The column field name is the
	// same as the column qualifier.
	Qualifier string

	// If the qualifier is not a valid BigQuery field identifier, i.e. does not match
	// [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
	// name and is used as the field name in queries.
	FieldName string

	// If true, only the latest version of values are exposed for this column.
	// See BigtableColumnFamily.OnlyReadLatest.
	OnlyReadLatest bool

	// The encoding of the values when the type is not STRING.
	// See BigtableColumnFamily.Encoding.
	Encoding string

	// The type to convert the value in cells of this column.
	// See BigtableColumnFamily.Type.
	Type string
}

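// An illustrative sketch (values are assumptions): a qualifier that is not a
// valid BigQuery identifier needs an explicit FieldName to be queryable:
//
//	col := &BigtableColumn{
//		Qualifier: "sensor-1", // contains '-', not a valid identifier
//		FieldName: "sensor_1", // used as the field name in queries
//		Type:      "INTEGER",
//	}
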
func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
	q := &bq.BigtableColumn{
		FieldName:      b.FieldName,
		OnlyReadLatest: b.OnlyReadLatest,
		Encoding:       b.Encoding,
		Type:           b.Type,
	}
	// Valid UTF-8 qualifiers are sent as plain strings; anything else is
	// base64-encoded into QualifierEncoded.
	if utf8.ValidString(b.Qualifier) {
		q.QualifierString = b.Qualifier
	} else {
		q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
	}
	return q
}

func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
	b := &BigtableColumn{
		FieldName:      q.FieldName,
		OnlyReadLatest: q.OnlyReadLatest,
		Encoding:       q.Encoding,
		Type:           q.Type,
	}
	if q.QualifierString != "" {
		b.Qualifier = q.QualifierString
	} else {
		// The qualifier was not valid UTF-8, so it was base64-encoded by toBQ.
		bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
		if err != nil {
			return nil, err
		}
		b.Qualifier = string(bytes)
	}
	return b, nil
}