// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"math/rand"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	bigquery "google.golang.org/api/bigquery/v2"
	storage "google.golang.org/api/storage/v1"
)

const (
	GB                         = 1 << 30
	MaxBackoff                 = 30000
	BaseBackoff                = 250
	BackoffGrowthFactor        = 1.8
	BackoffGrowthDamper        = 0.25
	JobStatusDone              = "DONE"
	DatasetAlreadyExists       = "Already Exists: Dataset"
	TableWriteEmptyDisposition = "WRITE_EMPTY"
)

func init() {
	scope := fmt.Sprintf("%s %s %s",
		bigquery.BigqueryScope,
		storage.DevstorageReadOnlyScope,
		"https://www.googleapis.com/auth/userinfo.profile")
	registerDemo("bigquery", scope, bqMain)
}

// This example demonstrates loading objects from Google Cloud Storage into
// BigQuery. Objects are specified by their bucket and a name prefix. Each
// object will be loaded into a new table identified by the object name minus
// any file extension. All tables are added to the specified dataset (one will
// be created if necessary). Currently, tables will not be overwritten and an
// attempt to load an object into a dataset that already contains its table
// will emit an error message indicating the table already exists.
//
// A schema file must be provided and it will be applied to every object/table.
// Example usage:
//   go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject
//       myDataBucket datafile2013070 DataFiles2013
//       ./datafile_schema.json 100
//
// This will load all objects (e.g. all data files from July 2013) from
// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013
// using the schema file provided and allowing up to 100 bad records. Assuming
// each object is named like datafileYYYYMMDD.csv.gz and all of July's files are
// stored in the bucket, 9 tables will be created named like datafile201307DD
// where DD ranges from 01 to 09, inclusive.
//
// When the program completes, it will emit a results line similar to:
//
//   9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725
//
// The total elapsed time from the start of the first job to the end of the
// last job (effectively wall-clock time) is shown. In parentheses is the
// aggregate time taken to load all tables.
func bqMain(client *http.Client, argv []string) {
	if len(argv) != 6 {
		fmt.Fprintln(os.Stderr,
			"Usage: bq project_id bucket prefix dataset schema max_bad_records")
		return
	}

	var (
		project    = argv[0]
		bucket     = argv[1]
		objPrefix  = argv[2]
		datasetId  = argv[3]
		schemaFile = argv[4]
	)
	badRecords, err := strconv.ParseInt(argv[5], 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	rand.Seed(time.Now().UnixNano())

	service, err := storage.New(client)
	if err != nil {
		log.Fatalf("Unable to create Storage service: %v", err)
	}

	// Get the list of objects in the bucket matching the specified prefix.
	list := service.Objects.List(bucket)
	list.Prefix(objPrefix)
	objects, err := list.Do()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Create the wrapper and insert the (new) dataset.
	dataset, err := newBQDataset(client, project, datasetId)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = dataset.insert(true); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	objectSource := &tableSource{
		maxBadRecords: badRecords,
		disposition:   TableWriteEmptyDisposition,
	}

	// Load the schema from disk.
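	// A minimal sketch of what a schema file such as ./datafile_schema.json
	// might contain; the field names here are hypothetical. json.Unmarshal
	// below maps this JSON onto bigquery.TableSchema:
	//
	//   {
	//     "fields": [
	//       {"name": "timestamp", "type": "TIMESTAMP", "mode": "REQUIRED"},
	//       {"name": "value", "type": "FLOAT", "mode": "NULLABLE"}
	//     ]
	//   }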
	f, err := ioutil.ReadFile(schemaFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = json.Unmarshal(f, &objectSource.schema); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Assumes all objects have a .csv, .csv.gz (or no) extension.
	tableIdFromObject := func(name string) string {
		return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")
	}

	// A jobset is a way to group a collection of jobs together for monitoring.
	// For this example, we just use the name of the bucket and object prefix.
	jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)
	fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(objects.Items))

	// Load each object into a table of the same name (minus any extension).
	// A successful insert call will inject the job into our queue for monitoring.
	for _, o := range objects.Items {
		objectSource.id = tableIdFromObject(o.Name)
		objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)
		if err = dataset.load(jobset, objectSource); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}

	dataset.monitor(jobset)
}

// bqDataset wraps the BigQuery service and dataset and provides some helper
// functions.
type bqDataset struct {
	project string
	id      string
	bq      *bigquery.Service
	dataset *bigquery.Dataset
	jobsets map[string]*list.List
}

func newBQDataset(client *http.Client, dsProj string, dsId string) (*bqDataset, error) {
	service, err := bigquery.New(client)
	if err != nil {
		log.Fatalf("Unable to create BigQuery service: %v", err)
	}

	return &bqDataset{
		project: dsProj,
		id:      dsId,
		bq:      service,
		dataset: &bigquery.Dataset{
			DatasetReference: &bigquery.DatasetReference{
				DatasetId: dsId,
				ProjectId: dsProj,
			},
		},
		jobsets: make(map[string]*list.List),
	}, nil
}

func (ds *bqDataset) insert(existsOK bool) error {
	call := ds.bq.Datasets.Insert(ds.project, ds.dataset)
	_, err := call.Do()
	if err != nil && (!existsOK || !strings.Contains(err.Error(), DatasetAlreadyExists)) {
		return err
	}

	return nil
}

type tableSource struct {
	id            string
	uri           string
	schema        bigquery.TableSchema
	maxBadRecords int64
	disposition   string
}

func (ds *bqDataset) load(jobset string, source *tableSource) error {
	job := &bigquery.Job{
		Configuration: &bigquery.JobConfiguration{
			Load: &bigquery.JobConfigurationLoad{
				DestinationTable: &bigquery.TableReference{
					DatasetId: ds.dataset.DatasetReference.DatasetId,
					ProjectId: ds.project,
					TableId:   source.id,
				},
				MaxBadRecords:    source.maxBadRecords,
				Schema:           &source.schema,
				SourceUris:       []string{source.uri},
				WriteDisposition: source.disposition,
			},
		},
	}

	call := ds.bq.Jobs.Insert(ds.project, job)
	job, err := call.Do()
	if err != nil {
		return err
	}

	_, ok := ds.jobsets[jobset]
	if !ok {
		ds.jobsets[jobset] = list.New()
	}
	ds.jobsets[jobset].PushBack(job)

	return nil
}

func (ds *bqDataset) getJob(id string) (*bigquery.Job, error) {
	return ds.bq.Jobs.Get(ds.project, id).Do()
}

func (ds *bqDataset) monitor(jobset string) {
	jobq, ok := ds.jobsets[jobset]
	if !ok {
		return
	}

	var backoff float64 = BaseBackoff
	pause := func(grow bool) {
		if grow {
			backoff *= BackoffGrowthFactor
			backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)
			backoff = math.Min(backoff, MaxBackoff)
			fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n",
				jobset, 1+jobq.Len())
		}
		time.Sleep(time.Duration(backoff) * time.Millisecond)
	}
	var stats jobStats

	// Track a 'head' pending job in the queue for detecting cycling.
	head := ""
	// Loop until all jobs are done - with either success or error.
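	// A rough illustration of the pacing, using the constants above: backoff
	// starts at BaseBackoff (250ms) and, each time the head job cycles back to
	// the front of the queue, it is multiplied by BackoffGrowthFactor (1.8) and
	// reduced by a random amount of up to BackoffGrowthDamper (25%), so the
	// waits grow roughly 250ms, ~450ms, ~810ms, ... capped at MaxBackoff (30s).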
	for jobq.Len() > 0 {
		jel := jobq.Front()
		job := jel.Value.(*bigquery.Job)
		jobq.Remove(jel)
		jid := job.JobReference.JobId
		loop := false

		// Check and possibly pick a new head job id.
		if len(head) == 0 {
			head = jid
		} else if jid == head {
			loop = true
		}

		// Retrieve the job's current status.
		pause(loop)
		j, err := ds.getJob(jid)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			// In the case of a transient API error, we want to keep the job.
			if j == nil {
				jobq.PushBack(job)
			} else {
				// Must reset the head tracker if the job is discarded.
				if loop {
					head = ""
					backoff = BaseBackoff
				}
			}
			continue
		}

		// Reassign with the updated job data (from Get). We don't keep using j
		// directly below because Get can return nil for it on error.
		job = j

		if job.Status.State != JobStatusDone {
			jobq.PushBack(job)
			continue
		}

		if res := job.Status.ErrorResult; res != nil {
			fmt.Fprintln(os.Stderr, res.Message)
		} else {
			stat := job.Statistics
			lstat := stat.Load
			stats.files += 1
			stats.bytesIn += lstat.InputFileBytes
			stats.bytesOut += lstat.OutputBytes
			stats.rows += lstat.OutputRows
			stats.elapsed +=
				time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond

			if stats.start.IsZero() {
				stats.start = time.Unix(stat.StartTime/1000, 0)
			} else {
				t := time.Unix(stat.StartTime/1000, 0)
				if stats.start.Sub(t) > 0 {
					stats.start = t
				}
			}

			if stats.finish.IsZero() {
				stats.finish = time.Unix(stat.EndTime/1000, 0)
			} else {
				t := time.Unix(stat.EndTime/1000, 0)
				if t.Sub(stats.finish) > 0 {
					stats.finish = t
				}
			}
		}
		// When the head job is processed, reset the backoff since the loads
		// run in BigQuery in parallel.
		if loop {
			head = ""
			backoff = BaseBackoff
		}
	}

	fmt.Fprintf(os.Stderr, "%#v\n", stats)
}

type jobStats struct {
	// Number of files (sources) loaded.
	files int64
	// Bytes read from the sources (possibly compressed).
	bytesIn int64
	// Bytes loaded into BigQuery (uncompressed).
	bytesOut int64
	// Rows loaded into BigQuery.
	rows int64
	// Aggregate time taken to load the sources into their tables.
	elapsed time.Duration
	// Start time of the earliest job.
	start time.Time
	// End time of the last job to finish.
	finish time.Time
}

func (s jobStats) GoString() string {
	return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",
		s.files, s.finish.Sub(s.start), s.elapsed,
		float64(s.bytesOut)/GB, s.rows)
}