// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bigquery

import (
    "encoding/base64"
    "unicode/utf8"

    bq "google.golang.org/api/bigquery/v2"
)

// DataFormat describes the format of BigQuery table data.
type DataFormat string

// Constants describing the format of BigQuery table data.
const (
    CSV             DataFormat = "CSV"
    Avro            DataFormat = "AVRO"
    JSON            DataFormat = "NEWLINE_DELIMITED_JSON"
    DatastoreBackup DataFormat = "DATASTORE_BACKUP"
    GoogleSheets    DataFormat = "GOOGLE_SHEETS"
    Bigtable        DataFormat = "BIGTABLE"
    Parquet         DataFormat = "PARQUET"
    ORC             DataFormat = "ORC"
)

// ExternalData is a table which is stored outside of BigQuery. It is implemented by
// *ExternalDataConfig.
// GCSReference also implements it, for backwards compatibility.
type ExternalData interface {
    toBQ() bq.ExternalDataConfiguration
}

// ExternalDataConfig describes data external to BigQuery that can be used
// in queries and to create external tables.
type ExternalDataConfig struct {
    // The format of the data. Required.
    SourceFormat DataFormat

    // The fully-qualified URIs that point to your
    // data in Google Cloud. Required.
    //
    // For Google Cloud Storage URIs, each URI can contain one '*' wildcard character
    // and it must come after the 'bucket' name. Size limits related to load jobs
    // apply to external data sources.
    //
    // For Google Cloud Bigtable URIs, exactly one URI can be specified and it has to be
    // a fully specified and valid HTTPS URL for a Google Cloud Bigtable table.
    //
    // For Google Cloud Datastore backups, exactly one URI can be specified. Also,
    // the '*' wildcard character is not allowed.
    SourceURIs []string

    // The schema of the data. Required for CSV and JSON; disallowed for the
    // other formats.
    Schema Schema

    // Try to detect schema and format options automatically.
    // Any option specified explicitly will be honored.
    AutoDetect bool

    // The compression type of the data.
    Compression Compression

    // IgnoreUnknownValues causes values not matching the schema to be
    // tolerated. Unknown values are ignored. For CSV this ignores extra values
    // at the end of a line. For JSON this ignores named values that do not
    // match any column name. If this field is not set, records containing
    // unknown values are treated as bad records. The MaxBadRecords field can
    // be used to customize how bad records are handled.
    IgnoreUnknownValues bool

    // MaxBadRecords is the maximum number of bad records that will be ignored
    // when reading data.
    MaxBadRecords int64

    // Additional options for CSV, GoogleSheets and Bigtable formats.
    Options ExternalDataConfigOptions
}
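
// Illustrative sketch (not part of the original file): one way an
// ExternalDataConfig might be built for newline-delimited JSON files in
// Cloud Storage. The bucket path and schema below are hypothetical.
//
//    edc := &ExternalDataConfig{
//        SourceFormat: JSON,
//        SourceURIs:   []string{"gs://my-bucket/logs/*.json"},
//        Schema: Schema{
//            {Name: "name", Type: StringFieldType},
//            {Name: "count", Type: IntegerFieldType},
//        },
//    }
//
// A value like this can then be assigned to TableMetadata.ExternalDataConfig
// when creating an external table, or used in QueryConfig.TableDefinitions to
// query the data in place.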

func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration {
    q := bq.ExternalDataConfiguration{
        SourceFormat:        string(e.SourceFormat),
        SourceUris:          e.SourceURIs,
        Autodetect:          e.AutoDetect,
        Compression:         string(e.Compression),
        IgnoreUnknownValues: e.IgnoreUnknownValues,
        MaxBadRecords:       e.MaxBadRecords,
    }
    if e.Schema != nil {
        q.Schema = e.Schema.toBQ()
    }
    if e.Options != nil {
        e.Options.populateExternalDataConfig(&q)
    }
    return q
}
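
// bqToExternalDataConfig converts the API representation back into an
// ExternalDataConfig, choosing the concrete Options type based on which of
// the format-specific options fields is populated.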
func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) {
    e := &ExternalDataConfig{
        SourceFormat:        DataFormat(q.SourceFormat),
        SourceURIs:          q.SourceUris,
        AutoDetect:          q.Autodetect,
        Compression:         Compression(q.Compression),
        IgnoreUnknownValues: q.IgnoreUnknownValues,
        MaxBadRecords:       q.MaxBadRecords,
        Schema:              bqToSchema(q.Schema),
    }
    switch {
    case q.CsvOptions != nil:
        e.Options = bqToCSVOptions(q.CsvOptions)
    case q.GoogleSheetsOptions != nil:
        e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions)
    case q.BigtableOptions != nil:
        var err error
        e.Options, err = bqToBigtableOptions(q.BigtableOptions)
        if err != nil {
            return nil, err
        }
    }
    return e, nil
}

// ExternalDataConfigOptions are additional options for external data configurations.
// This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions.
type ExternalDataConfigOptions interface {
    populateExternalDataConfig(*bq.ExternalDataConfiguration)
}

// CSVOptions are additional options for CSV external data sources.
type CSVOptions struct {
    // AllowJaggedRows causes missing trailing optional columns to be tolerated
    // when reading CSV data. Missing values are treated as nulls.
    AllowJaggedRows bool

    // AllowQuotedNewlines sets whether quoted data sections containing
    // newlines are allowed when reading CSV data.
    AllowQuotedNewlines bool

    // Encoding is the character encoding of data to be read.
    Encoding Encoding

    // FieldDelimiter is the separator for fields in a CSV file, used when
    // reading or exporting data. The default is ",".
    FieldDelimiter string

    // Quote is the value used to quote data sections in a CSV file. The
    // default quotation character is the double quote ("), which is used if
    // both Quote and ForceZeroQuote are unset.
    // To specify that no character should be interpreted as a quotation
    // character, set ForceZeroQuote to true.
    // Only used when reading data.
    Quote          string
    ForceZeroQuote bool

    // The number of rows at the top of a CSV file that BigQuery will skip when
    // reading data.
    SkipLeadingRows int64
}
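
// Illustrative sketch (not part of the original file) of the Quote /
// ForceZeroQuote interaction described above; the delimiter choice is
// hypothetical.
//
//    opts := &CSVOptions{
//        FieldDelimiter:  "|",
//        SkipLeadingRows: 1,
//        // No character should be treated as a quote:
//        ForceZeroQuote: true,
//    }
//
// Leaving both Quote and ForceZeroQuote unset keeps the default double-quote
// (") behavior.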

func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
    c.CsvOptions = &bq.CsvOptions{
        AllowJaggedRows:     o.AllowJaggedRows,
        AllowQuotedNewlines: o.AllowQuotedNewlines,
        Encoding:            string(o.Encoding),
        FieldDelimiter:      o.FieldDelimiter,
        Quote:               o.quote(),
        SkipLeadingRows:     o.SkipLeadingRows,
    }
}

// quote returns the CSV quote character, or nil if unset.
func (o *CSVOptions) quote() *string {
    if o.ForceZeroQuote {
        quote := ""
        return &quote
    }
    if o.Quote == "" {
        return nil
    }
    return &o.Quote
}
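
// setQuote records a CSV quote value received from the API: a nil pointer
// leaves Quote and ForceZeroQuote at their zero values, an empty string marks
// quoting as explicitly disabled, and any other value becomes the quote
// character.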
func (o *CSVOptions) setQuote(ps *string) {
    if ps != nil {
        o.Quote = *ps
        if o.Quote == "" {
            o.ForceZeroQuote = true
        }
    }
}

func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions {
    o := &CSVOptions{
        AllowJaggedRows:     q.AllowJaggedRows,
        AllowQuotedNewlines: q.AllowQuotedNewlines,
        Encoding:            Encoding(q.Encoding),
        FieldDelimiter:      q.FieldDelimiter,
        SkipLeadingRows:     q.SkipLeadingRows,
    }
    o.setQuote(q.Quote)
    return o
}

// GoogleSheetsOptions are additional options for GoogleSheets external data sources.
type GoogleSheetsOptions struct {
    // The number of rows at the top of a sheet that BigQuery will skip when
    // reading data.
    SkipLeadingRows int64
}

func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
    c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{
        SkipLeadingRows: o.SkipLeadingRows,
    }
}

func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions {
    return &GoogleSheetsOptions{
        SkipLeadingRows: q.SkipLeadingRows,
    }
}

// BigtableOptions are additional options for Bigtable external data sources.
type BigtableOptions struct {
    // A list of column families to expose in the table schema along with their
    // types. If omitted, all column families are present in the table schema and
    // their values are read as BYTES.
    ColumnFamilies []*BigtableColumnFamily

    // If true, then the column families that are not specified in columnFamilies
    // list are not exposed in the table schema. Otherwise, they are read with BYTES
    // type values. The default is false.
    IgnoreUnspecifiedColumnFamilies bool

    // If true, then the rowkey column families will be read and converted to string.
    // Otherwise they are read with BYTES type values and users need to manually cast
    // them with CAST if necessary. The default is false.
    ReadRowkeyAsString bool
}
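
// Illustrative sketch (not part of the original file): exposing a single
// Bigtable column family with typed columns. The family, qualifier and type
// choices are hypothetical.
//
//    opts := &BigtableOptions{
//        ReadRowkeyAsString: true,
//        ColumnFamilies: []*BigtableColumnFamily{{
//            FamilyID: "stats",
//            Type:     "STRING",
//            Columns: []*BigtableColumn{{
//                Qualifier: "views",
//                Type:      "INTEGER",
//            }},
//        }},
//    }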

func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
    q := &bq.BigtableOptions{
        IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies,
        ReadRowkeyAsString:              o.ReadRowkeyAsString,
    }
    for _, f := range o.ColumnFamilies {
        q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ())
    }
    c.BigtableOptions = q
}

func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) {
    b := &BigtableOptions{
        IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies,
        ReadRowkeyAsString:              q.ReadRowkeyAsString,
    }
    for _, f := range q.ColumnFamilies {
        f2, err := bqToBigtableColumnFamily(f)
        if err != nil {
            return nil, err
        }
        b.ColumnFamilies = append(b.ColumnFamilies, f2)
    }
    return b, nil
}

// BigtableColumnFamily describes how BigQuery should access a Bigtable column family.
type BigtableColumnFamily struct {
    // Identifier of the column family.
    FamilyID string

    // Lists of columns that should be exposed as individual fields as opposed to a
    // list of (column name, value) pairs. All columns whose qualifier matches a
    // qualifier in this list can be accessed as individual fields. Other columns
    // can be accessed as a list through the column family's Column field.
    Columns []*BigtableColumn

    // The encoding of the values when the type is not STRING. Acceptable encoding values are:
    // - TEXT - indicates values are alphanumeric text strings.
    // - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions.
    // This can be overridden for a specific column by listing that column in 'columns' and
    // specifying an encoding for it.
    Encoding string

    // If true, only the latest version of values are exposed for all columns in this
    // column family. This can be overridden for a specific column by listing that
    // column in 'columns' and specifying a different setting for that column.
    OnlyReadLatest bool

    // The type to convert the value in cells of this
    // column family. The values are expected to be encoded using HBase
    // Bytes.toBytes function when using the BINARY encoding value.
    // Following BigQuery types are allowed (case-sensitive):
    // BYTES STRING INTEGER FLOAT BOOLEAN.
    // The default type is BYTES. This can be overridden for a specific column by
    // listing that column in 'columns' and specifying a type for it.
    Type string
}

func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily {
    q := &bq.BigtableColumnFamily{
        FamilyId:       b.FamilyID,
        Encoding:       b.Encoding,
        OnlyReadLatest: b.OnlyReadLatest,
        Type:           b.Type,
    }
    for _, col := range b.Columns {
        q.Columns = append(q.Columns, col.toBQ())
    }
    return q
}

func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) {
    b := &BigtableColumnFamily{
        FamilyID:       q.FamilyId,
        Encoding:       q.Encoding,
        OnlyReadLatest: q.OnlyReadLatest,
        Type:           q.Type,
    }
    for _, col := range q.Columns {
        c, err := bqToBigtableColumn(col)
        if err != nil {
            return nil, err
        }
        b.Columns = append(b.Columns, c)
    }
    return b, nil
}

// BigtableColumn describes how BigQuery should access a Bigtable column.
type BigtableColumn struct {
    // Qualifier of the column. Columns in the parent column family that have this
    // exact qualifier are exposed as an individual field. The column field name is
    // the same as the column qualifier.
    Qualifier string

    // If the qualifier is not a valid BigQuery field identifier i.e. does not match
    // [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
    // name and is used as field name in queries.
    FieldName string

    // If true, only the latest version of values are exposed for this column.
    // See BigtableColumnFamily.OnlyReadLatest.
    OnlyReadLatest bool

    // The encoding of the values when the type is not STRING.
    // See BigtableColumnFamily.Encoding
    Encoding string

    // The type to convert the value in cells of this column.
    // See BigtableColumnFamily.Type
    Type string
}
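
// Illustrative sketch (not part of the original file): a qualifier that is
// not a valid BigQuery identifier needs an explicit FieldName. The names are
// hypothetical.
//
//    col := &BigtableColumn{
//        Qualifier: "2019-01",       // not a valid field identifier
//        FieldName: "views_2019_01", // name used in the schema and in queries
//        Type:      "INTEGER",
//    }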

func (b *BigtableColumn) toBQ() *bq.BigtableColumn {
    q := &bq.BigtableColumn{
        FieldName:      b.FieldName,
        OnlyReadLatest: b.OnlyReadLatest,
        Encoding:       b.Encoding,
        Type:           b.Type,
    }
    if utf8.ValidString(b.Qualifier) {
        q.QualifierString = b.Qualifier
    } else {
        q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier))
    }
    return q
}

func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
    b := &BigtableColumn{
        FieldName:      q.FieldName,
        OnlyReadLatest: q.OnlyReadLatest,
        Encoding:       q.Encoding,
        Type:           q.Type,
    }
    if q.QualifierString != "" {
        b.Qualifier = q.QualifierString
    } else {
        bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded)
        if err != nil {
            return nil, err
        }
        b.Qualifier = string(bytes)
    }
    return b, nil
}