// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"math/rand"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	bigquery "google.golang.org/api/bigquery/v2"
	storage "google.golang.org/api/storage/v1"
)

const (
	GB = 1 << 30

	// Backoff parameters; MaxBackoff and BaseBackoff are in milliseconds.
	MaxBackoff          = 30000
	BaseBackoff         = 250
	BackoffGrowthFactor = 1.8
	BackoffGrowthDamper = 0.25

	JobStatusDone              = "DONE"
	DatasetAlreadyExists       = "Already Exists: Dataset"
	TableWriteEmptyDisposition = "WRITE_EMPTY"
)
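
// init registers the "bigquery" demo along with the OAuth scopes it needs:
// BigQuery, read-only Cloud Storage, and the user's profile.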
func init() {
	scope := fmt.Sprintf("%s %s %s", bigquery.BigqueryScope,
		storage.DevstorageReadOnlyScope,
		"https://www.googleapis.com/auth/userinfo.profile")
	registerDemo("bigquery", scope, bqMain)
}

// This example demonstrates loading objects from Google Cloud Storage into
// BigQuery. Objects are specified by their bucket and a name prefix. Each
// object will be loaded into a new table identified by the object name minus
// any file extension. All tables are added to the specified dataset (one will
// be created if necessary). Currently, tables will not be overwritten and an
// attempt to load an object into a dataset that already contains its table
// will emit an error message indicating the table already exists.
// A schema file must be provided and it will be applied to every object/table.
// Example usage:
//
//	go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject
//	    myDataBucket datafile2013070 DataFiles2013
//	    ./datafile_schema.json 100
//
// This will load all objects (e.g. all data files from July 2013) from
// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013
// using the schema file provided and allowing up to 100 bad records. Assuming
// each object is named like datafileYYYYMMDD.csv.gz and all of July's files are
// stored in the bucket, 9 tables will be created named like datafile201307DD
// where DD ranges from 01 to 09, inclusive.
//
// When the program completes, it will emit a results line similar to:
//
//	9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725
//
// The total elapsed time from the start of the first job to the end of the
// last job (effectively wall clock time) is shown. In parentheses is the
// aggregate time taken to load all tables.
func bqMain(client *http.Client, argv []string) {
	if len(argv) != 6 {
		fmt.Fprintln(os.Stderr,
			"Usage: bq project_id bucket prefix dataset schema max_bad_records")
		return
	}

	var (
		project    = argv[0]
		bucket     = argv[1]
		objPrefix  = argv[2]
		datasetId  = argv[3]
		schemaFile = argv[4]
	)
	badRecords, err := strconv.ParseInt(argv[5], 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	rand.Seed(time.Now().UnixNano())

	service, err := storage.New(client)
	if err != nil {
		log.Fatalf("Unable to create Storage service: %v", err)
	}

	// Get the list of objects in the bucket matching the specified prefix.
	list := service.Objects.List(bucket)
	list.Prefix(objPrefix)
	objects, err := list.Do()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Create the wrapper and insert the (new) dataset.
	dataset, err := newBQDataset(client, project, datasetId)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = dataset.insert(true); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	objectSource := &tableSource{
		maxBadRecords: badRecords,
		disposition:   TableWriteEmptyDisposition,
	}

	// Load the schema from disk.
	f, err := ioutil.ReadFile(schemaFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = json.Unmarshal(f, &objectSource.schema); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Assumes all objects have a .csv, .csv.gz (or no) extension.
	tableIdFromObject := func(name string) string {
		return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")
	}

	// A jobset is a way to group a collection of jobs together for monitoring.
	// For this example, we just use the name of the bucket and object prefix.
	jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)
	fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(objects.Items))

	// Load each object into a table of the same name (minus any extension).
	// A successful insert call will inject the job into our queue for monitoring.
	for _, o := range objects.Items {
		objectSource.id = tableIdFromObject(o.Name)
		objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)
		if err = dataset.load(jobset, objectSource); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}

	dataset.monitor(jobset)
}

// Wraps the BigQuery service and dataset and provides some helper functions.
type bqDataset struct {
	project string
	id      string
	bq      *bigquery.Service
	dataset *bigquery.Dataset
	jobsets map[string]*list.List
}
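
// newBQDataset builds a bqDataset wrapper for the given project and dataset
// id. It creates the BigQuery service client but does not create the dataset
// on the server; call insert for that.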
func newBQDataset(client *http.Client, dsProj string, dsId string) (*bqDataset, error) {
	service, err := bigquery.New(client)
	if err != nil {
		log.Fatalf("Unable to create BigQuery service: %v", err)
	}

	return &bqDataset{
		project: dsProj,
		id:      dsId,
		bq:      service,
		dataset: &bigquery.Dataset{
			DatasetReference: &bigquery.DatasetReference{
				DatasetId: dsId,
				ProjectId: dsProj,
			},
		},
		jobsets: make(map[string]*list.List),
	}, nil
}
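
// insert creates the dataset in BigQuery. When existsOK is true, an
// "Already Exists" error from the API is ignored.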
func (ds *bqDataset) insert(existsOK bool) error {
	call := ds.bq.Datasets.Insert(ds.project, ds.dataset)
	_, err := call.Do()
	if err != nil && (!existsOK || !strings.Contains(err.Error(), DatasetAlreadyExists)) {
		return err
	}

	return nil
}
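
// A tableSource describes one Cloud Storage object to load: the destination
// table id, the gs:// URI of the object, the table schema, and the load
// options (maximum bad records and write disposition).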
type tableSource struct {
	id            string
	uri           string
	schema        bigquery.TableSchema
	maxBadRecords int64
	disposition   string
}
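
// load submits an asynchronous load job for the given source and, on success,
// queues the returned job under the named jobset for later monitoring.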
func (ds *bqDataset) load(jobset string, source *tableSource) error {
	job := &bigquery.Job{
		Configuration: &bigquery.JobConfiguration{
			Load: &bigquery.JobConfigurationLoad{
				DestinationTable: &bigquery.TableReference{
					DatasetId: ds.dataset.DatasetReference.DatasetId,
					ProjectId: ds.project,
					TableId:   source.id,
				},
				MaxBadRecords:    source.maxBadRecords,
				Schema:           &source.schema,
				SourceUris:       []string{source.uri},
				WriteDisposition: source.disposition,
			},
		},
	}

	call := ds.bq.Jobs.Insert(ds.project, job)
	job, err := call.Do()
	if err != nil {
		return err
	}

	_, ok := ds.jobsets[jobset]
	if !ok {
		ds.jobsets[jobset] = list.New()
	}
	ds.jobsets[jobset].PushBack(job)

	return nil
}
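
// getJob retrieves the current state of the job with the given id.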
func (ds *bqDataset) getJob(id string) (*bigquery.Job, error) {
	return ds.bq.Jobs.Get(ds.project, id).Do()
}
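
// monitor polls every job in the named jobset until each one is DONE,
// sleeping between passes with a randomized, capped exponential backoff.
// Statistics from successful loads are accumulated and printed at the end.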
func (ds *bqDataset) monitor(jobset string) {
	jobq, ok := ds.jobsets[jobset]
	if !ok {
		return
	}

	var backoff float64 = BaseBackoff
	pause := func(grow bool) {
		if grow {
			backoff *= BackoffGrowthFactor
			backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)
			backoff = math.Min(backoff, MaxBackoff)
			fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n", jobset,
				1+jobq.Len())
		}
		time.Sleep(time.Duration(backoff) * time.Millisecond)
	}
	var stats jobStats

	// Track a 'head' pending job in the queue for detecting cycling.
	head := ""
	// Loop until all jobs are done - with either success or error.
	for jobq.Len() > 0 {
		jel := jobq.Front()
		job := jel.Value.(*bigquery.Job)
		jobq.Remove(jel)
		jid := job.JobReference.JobId
		loop := false

		// Check and possibly pick a new head job id.
		if len(head) == 0 {
			head = jid
		} else {
			if jid == head {
				loop = true
			}
		}

		// Retrieve the job's current status.
		pause(loop)
		j, err := ds.getJob(jid)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			// In the case of a transient API error, we want to keep the job.
			if j == nil {
				jobq.PushBack(job)
			} else {
				// Must reset the head tracker if the job is discarded.
				if loop {
					head = ""
					backoff = BaseBackoff
				}
			}
			continue
		}

		// Reassign with the updated job data from Get (j can only be nil when
		// Get returns an error, which was handled above).
		job = j

		if job.Status.State != JobStatusDone {
			jobq.PushBack(job)
			continue
		}

		if res := job.Status.ErrorResult; res != nil {
			fmt.Fprintln(os.Stderr, res.Message)
		} else {
			stat := job.Statistics
			lstat := stat.Load
			stats.files += 1
			stats.bytesIn += lstat.InputFileBytes
			stats.bytesOut += lstat.OutputBytes
			stats.rows += lstat.OutputRows
			stats.elapsed +=
				time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond

			if stats.start.IsZero() {
				stats.start = time.Unix(stat.StartTime/1000, 0)
			} else {
				t := time.Unix(stat.StartTime/1000, 0)
				if stats.start.Sub(t) > 0 {
					stats.start = t
				}
			}

			if stats.finish.IsZero() {
				stats.finish = time.Unix(stat.EndTime/1000, 0)
			} else {
				t := time.Unix(stat.EndTime/1000, 0)
				if t.Sub(stats.finish) > 0 {
					stats.finish = t
				}
			}
		}
		// When the head job is processed, reset the backoff since the loads
		// run in BQ in parallel.
		if loop {
			head = ""
			backoff = BaseBackoff
		}
	}

	fmt.Fprintf(os.Stderr, "%#v\n", stats)
}
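
// jobStats accumulates load statistics across all completed jobs in a jobset.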
type jobStats struct {
	// Number of files (sources) loaded.
	files int64
	// Bytes read from source (possibly compressed).
	bytesIn int64
	// Bytes loaded into BigQuery (uncompressed).
	bytesOut int64
	// Rows loaded into BigQuery.
	rows int64
	// Total time taken to load the sources into tables.
	elapsed time.Duration
	// Start time of the earliest job.
	start time.Time
	// End time of the latest job.
	finish time.Time
}
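
// GoString implements fmt.GoStringer so the final stats printed with %#v
// appear as a human-readable summary line.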
func (s jobStats) GoString() string {
	return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",
		s.files, s.finish.Sub(s.start), s.elapsed, float64(s.bytesOut)/GB,
		s.rows)
}