You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

553 lines
18 KiB

  1. // Copyright 2018 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Package proftest contains test helpers for profiler agent integration tests.
  16. // This package is experimental.
  17. // golang.org/x/build/kubernetes/dialer.go imports "context" package (rather
  18. // than "golang.org/x/net/context") and that does not exist in Go 1.6 or
  19. // earlier.
  20. // +build go1.7
  21. package proftest
  22. import (
  23. "archive/zip"
  24. "bytes"
  25. "encoding/json"
  26. "errors"
  27. "fmt"
  28. "io/ioutil"
  29. "log"
  30. "net/http"
  31. "strings"
  32. "time"
  33. "github.com/googleapis/gax-go"
  34. "cloud.google.com/go/storage"
  35. "golang.org/x/build/kubernetes"
  36. k8sapi "golang.org/x/build/kubernetes/api"
  37. "golang.org/x/build/kubernetes/gke"
  38. "golang.org/x/net/context"
  39. cloudbuild "google.golang.org/api/cloudbuild/v1"
  40. compute "google.golang.org/api/compute/v1"
  41. container "google.golang.org/api/container/v1"
  42. "google.golang.org/api/googleapi"
  43. )
  44. const (
  45. monitorWriteScope = "https://www.googleapis.com/auth/monitoring.write"
  46. storageReadScope = "https://www.googleapis.com/auth/devstorage.read_only"
  47. )
  48. // TestRunner has common elements used for testing profiling agents on a range
  49. // of environments.
  50. type TestRunner struct {
  51. Client *http.Client
  52. }
  53. // GCETestRunner supports testing a profiling agent on GCE.
  54. type GCETestRunner struct {
  55. TestRunner
  56. ComputeService *compute.Service
  57. }
  58. // GKETestRunner supports testing a profiling agent on GKE.
  59. type GKETestRunner struct {
  60. TestRunner
  61. ContainerService *container.Service
  62. StorageClient *storage.Client
  63. Dockerfile string
  64. }
  65. // ProfileResponse contains the response produced when querying profile server.
  66. type ProfileResponse struct {
  67. Profile ProfileData `json:"profile"`
  68. NumProfiles int32 `json:"numProfiles"`
  69. Deployments []interface{} `json:"deployments"`
  70. }
  71. // ProfileData has data of a single profile.
  72. type ProfileData struct {
  73. Samples []int32 `json:"samples"`
  74. SampleMetrics interface{} `json:"sampleMetrics"`
  75. DefaultMetricType string `json:"defaultMetricType"`
  76. TreeNodes interface{} `json:"treeNodes"`
  77. Functions functionArray `json:"functions"`
  78. SourceFiles interface{} `json:"sourceFiles"`
  79. }
  80. type functionArray struct {
  81. Name []string `json:"name"`
  82. Sourcefile []int32 `json:"sourceFile"`
  83. }
  84. // InstanceConfig is configuration for starting single GCE instance for
  85. // profiling agent test case.
  86. type InstanceConfig struct {
  87. ProjectID string
  88. Zone string
  89. Name string
  90. StartupScript string
  91. MachineType string
  92. }
  93. // ClusterConfig is configuration for starting single GKE cluster for profiling
  94. // agent test case.
  95. type ClusterConfig struct {
  96. ProjectID string
  97. Zone string
  98. ClusterName string
  99. PodName string
  100. ImageSourceName string
  101. ImageName string
  102. Bucket string
  103. Dockerfile string
  104. }
  105. // HasFunction returns nil if the function is present, or, if the function is
  106. // not present, and error providing more details why the function is not
  107. // present.
  108. func (pr *ProfileResponse) HasFunction(functionName string) error {
  109. if pr.NumProfiles == 0 {
  110. return fmt.Errorf("failed to find function name %s in profile: profile response contains zero profiles: %v", functionName, pr)
  111. }
  112. if len(pr.Deployments) == 0 {
  113. return fmt.Errorf("failed to find function name %s in profile: profile response contains zero deployments: %v", functionName, pr)
  114. }
  115. if len(pr.Profile.Functions.Name) == 0 {
  116. return fmt.Errorf("failed to find function name %s in profile: profile does not have function data", functionName)
  117. }
  118. for _, name := range pr.Profile.Functions.Name {
  119. if strings.Contains(name, functionName) {
  120. return nil
  121. }
  122. }
  123. return fmt.Errorf("failed to find function name %s in profile", functionName)
  124. }
  125. // StartInstance starts a GCE Instance with name, zone, and projectId specified
  126. // by the inst, and which runs the startup script specified in inst.
  127. func (tr *GCETestRunner) StartInstance(ctx context.Context, inst *InstanceConfig) error {
  128. img, err := tr.ComputeService.Images.GetFromFamily("debian-cloud", "debian-9").Context(ctx).Do()
  129. if err != nil {
  130. return err
  131. }
  132. op, err := tr.ComputeService.Instances.Insert(inst.ProjectID, inst.Zone, &compute.Instance{
  133. MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", inst.Zone, inst.MachineType),
  134. Name: inst.Name,
  135. Disks: []*compute.AttachedDisk{{
  136. AutoDelete: true, // delete the disk when the VM is deleted.
  137. Boot: true,
  138. Type: "PERSISTENT",
  139. Mode: "READ_WRITE",
  140. InitializeParams: &compute.AttachedDiskInitializeParams{
  141. SourceImage: img.SelfLink,
  142. DiskType: fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/zones/%s/diskTypes/pd-standard", inst.ProjectID, inst.Zone),
  143. },
  144. }},
  145. NetworkInterfaces: []*compute.NetworkInterface{{
  146. Network: fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/global/networks/default", inst.ProjectID),
  147. AccessConfigs: []*compute.AccessConfig{{
  148. Name: "External NAT",
  149. }},
  150. }},
  151. Metadata: &compute.Metadata{
  152. Items: []*compute.MetadataItems{{
  153. Key: "startup-script",
  154. Value: googleapi.String(inst.StartupScript),
  155. }},
  156. },
  157. ServiceAccounts: []*compute.ServiceAccount{{
  158. Email: "default",
  159. Scopes: []string{
  160. monitorWriteScope,
  161. },
  162. }},
  163. }).Do()
  164. if err != nil {
  165. return fmt.Errorf("failed to create instance: %v", err)
  166. }
  167. // Poll status of the operation to create the instance.
  168. getOpCall := tr.ComputeService.ZoneOperations.Get(inst.ProjectID, inst.Zone, op.Name)
  169. for {
  170. if err := checkOpErrors(op); err != nil {
  171. return fmt.Errorf("failed to create instance: %v", err)
  172. }
  173. if op.Status == "DONE" {
  174. return nil
  175. }
  176. if err := gax.Sleep(ctx, 5*time.Second); err != nil {
  177. return err
  178. }
  179. op, err = getOpCall.Do()
  180. if err != nil {
  181. return fmt.Errorf("failed to get operation: %v", err)
  182. }
  183. }
  184. }
  185. // checkOpErrors returns nil if the operation does not have any errors and an
  186. // error summarizing all errors encountered if the operation has errored.
  187. func checkOpErrors(op *compute.Operation) error {
  188. if op.Error == nil || len(op.Error.Errors) == 0 {
  189. return nil
  190. }
  191. var errs []string
  192. for _, e := range op.Error.Errors {
  193. if e.Message != "" {
  194. errs = append(errs, e.Message)
  195. } else {
  196. errs = append(errs, e.Code)
  197. }
  198. }
  199. return errors.New(strings.Join(errs, ","))
  200. }
  201. // DeleteInstance deletes an instance with project id, name, and zone matched
  202. // by inst.
  203. func (tr *GCETestRunner) DeleteInstance(ctx context.Context, inst *InstanceConfig) error {
  204. if _, err := tr.ComputeService.Instances.Delete(inst.ProjectID, inst.Zone, inst.Name).Context(ctx).Do(); err != nil {
  205. return fmt.Errorf("Instances.Delete(%s) got error: %v", inst.Name, err)
  206. }
  207. return nil
  208. }
  209. // PollForSerialOutput polls serial port 2 of the GCE instance specified by
  210. // inst and returns when the finishString appears in the serial output
  211. // of the instance, or when the context times out.
  212. func (tr *GCETestRunner) PollForSerialOutput(ctx context.Context, inst *InstanceConfig, finishString string) error {
  213. var output string
  214. defer func() {
  215. log.Printf("Serial port output for %s:\n%s", inst.Name, output)
  216. }()
  217. for {
  218. select {
  219. case <-ctx.Done():
  220. return ctx.Err()
  221. case <-time.After(20 * time.Second):
  222. resp, err := tr.ComputeService.Instances.GetSerialPortOutput(inst.ProjectID, inst.Zone, inst.Name).Port(2).Context(ctx).Do()
  223. if err != nil {
  224. // Transient failure.
  225. log.Printf("Transient error getting serial port output from instance %s (will retry): %v", inst.Name, err)
  226. continue
  227. }
  228. if resp.Contents == "" {
  229. log.Printf("Ignoring empty serial port output from instance %s (will retry)", inst.Name)
  230. continue
  231. }
  232. if output = resp.Contents; strings.Contains(output, finishString) {
  233. return nil
  234. }
  235. }
  236. }
  237. }
  238. // QueryProfiles retrieves profiles of a specific type, from a specific time
  239. // range, associated with a particular service and project.
  240. func (tr *TestRunner) QueryProfiles(projectID, service, startTime, endTime, profileType string) (ProfileResponse, error) {
  241. queryURL := fmt.Sprintf("https://cloudprofiler.googleapis.com/v2/projects/%s/profiles:query", projectID)
  242. const queryJSONFmt = `{"endTime": "%s", "profileType": "%s","startTime": "%s", "target": "%s"}`
  243. queryRequest := fmt.Sprintf(queryJSONFmt, endTime, profileType, startTime, service)
  244. resp, err := tr.Client.Post(queryURL, "application/json", strings.NewReader(queryRequest))
  245. if err != nil {
  246. return ProfileResponse{}, fmt.Errorf("failed to query API: %v", err)
  247. }
  248. defer resp.Body.Close()
  249. body, err := ioutil.ReadAll(resp.Body)
  250. if err != nil {
  251. return ProfileResponse{}, fmt.Errorf("failed to read response body: %v", err)
  252. }
  253. var pr ProfileResponse
  254. if err := json.Unmarshal(body, &pr); err != nil {
  255. return ProfileResponse{}, err
  256. }
  257. return pr, nil
  258. }
  259. // createAndPublishDockerImage creates a docker image from source code in a GCS
  260. // bucket and pushes the image to Google Container Registry.
  261. func (tr *GKETestRunner) createAndPublishDockerImage(ctx context.Context, projectID, sourceBucket, sourceObject, ImageName string) error {
  262. cloudbuildService, err := cloudbuild.New(tr.Client)
  263. build := &cloudbuild.Build{
  264. Source: &cloudbuild.Source{
  265. StorageSource: &cloudbuild.StorageSource{
  266. Bucket: sourceBucket,
  267. Object: sourceObject,
  268. },
  269. },
  270. Steps: []*cloudbuild.BuildStep{
  271. {
  272. Name: "gcr.io/cloud-builders/docker",
  273. Args: []string{"build", "-t", ImageName, "."},
  274. },
  275. },
  276. Images: []string{ImageName},
  277. }
  278. op, err := cloudbuildService.Projects.Builds.Create(projectID, build).Context(ctx).Do()
  279. if err != nil {
  280. return fmt.Errorf("failed to create image: %v", err)
  281. }
  282. opID := op.Name
  283. // Wait for creating image.
  284. for {
  285. select {
  286. case <-ctx.Done():
  287. return fmt.Errorf("timed out waiting creating image")
  288. case <-time.After(10 * time.Second):
  289. op, err := cloudbuildService.Operations.Get(opID).Context(ctx).Do()
  290. if err != nil {
  291. log.Printf("Transient error getting operation (will retry): %v", err)
  292. break
  293. }
  294. if op.Done == true {
  295. log.Printf("Published image %s to Google Container Registry.", ImageName)
  296. return nil
  297. }
  298. }
  299. }
  300. }
  301. type imageResponse struct {
  302. Manifest map[string]interface{} `json:"manifest"`
  303. Name string `json:"name"`
  304. Tags []string `json:"tags"`
  305. }
  306. // deleteDockerImage deletes a docker image from Google Container Registry.
  307. func (tr *GKETestRunner) deleteDockerImage(ctx context.Context, ImageName string) []error {
  308. queryImageURL := fmt.Sprintf("https://gcr.io/v2/%s/tags/list", ImageName)
  309. resp, err := tr.Client.Get(queryImageURL)
  310. if err != nil {
  311. return []error{fmt.Errorf("failed to list tags: %v", err)}
  312. }
  313. defer resp.Body.Close()
  314. body, err := ioutil.ReadAll(resp.Body)
  315. if err != nil {
  316. return []error{err}
  317. }
  318. var ir imageResponse
  319. if err := json.Unmarshal(body, &ir); err != nil {
  320. return []error{err}
  321. }
  322. const deleteImageURLFmt = "https://gcr.io/v2/%s/manifests/%s"
  323. var errs []error
  324. for _, tag := range ir.Tags {
  325. if err := deleteDockerImageResource(tr.Client, fmt.Sprintf(deleteImageURLFmt, ImageName, tag)); err != nil {
  326. errs = append(errs, fmt.Errorf("failed to delete tag %s: %v", tag, err))
  327. }
  328. }
  329. for manifest := range ir.Manifest {
  330. if err := deleteDockerImageResource(tr.Client, fmt.Sprintf(deleteImageURLFmt, ImageName, manifest)); err != nil {
  331. errs = append(errs, fmt.Errorf("failed to delete manifest %s: %v", manifest, err))
  332. }
  333. }
  334. return errs
  335. }
  336. func deleteDockerImageResource(client *http.Client, url string) error {
  337. req, err := http.NewRequest("DELETE", url, nil)
  338. if err != nil {
  339. return fmt.Errorf("failed to get request: %v", err)
  340. }
  341. resp, err := client.Do(req)
  342. if err != nil {
  343. return fmt.Errorf("failed to delete resource: %v", err)
  344. }
  345. defer resp.Body.Close()
  346. if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
  347. return fmt.Errorf("failed to delete resource: status code = %d", resp.StatusCode)
  348. }
  349. return nil
  350. }
  351. func (tr *GKETestRunner) createCluster(ctx context.Context, client *http.Client, projectID, zone, ClusterName string) error {
  352. request := &container.CreateClusterRequest{Cluster: &container.Cluster{
  353. Name: ClusterName,
  354. InitialNodeCount: 3,
  355. NodeConfig: &container.NodeConfig{
  356. OauthScopes: []string{
  357. storageReadScope,
  358. },
  359. },
  360. }}
  361. op, err := tr.ContainerService.Projects.Zones.Clusters.Create(projectID, zone, request).Context(ctx).Do()
  362. if err != nil {
  363. return fmt.Errorf("failed to create cluster %s: %v", ClusterName, err)
  364. }
  365. opID := op.Name
  366. // Wait for creating cluster.
  367. for {
  368. select {
  369. case <-ctx.Done():
  370. return fmt.Errorf("timed out waiting creating cluster")
  371. case <-time.After(10 * time.Second):
  372. op, err := tr.ContainerService.Projects.Zones.Operations.Get(projectID, zone, opID).Context(ctx).Do()
  373. if err != nil {
  374. log.Printf("Transient error getting operation (will retry): %v", err)
  375. break
  376. }
  377. if op.Status == "DONE" {
  378. log.Printf("Created cluster %s.", ClusterName)
  379. return nil
  380. }
  381. if op.Status == "ABORTING" {
  382. return fmt.Errorf("create cluster operation is aborted")
  383. }
  384. }
  385. }
  386. }
  387. func (tr *GKETestRunner) deployContainer(ctx context.Context, kubernetesClient *kubernetes.Client, podName, ImageName string) error {
  388. // TODO: Pod restart policy defaults to "Always". Previous logs will disappear
  389. // after restarting. Always restart causes the test not be able to see the
  390. // finish signal. Should probably set the restart policy to "OnFailure" when
  391. // we get the GKE workflow working and testable.
  392. pod := &k8sapi.Pod{
  393. ObjectMeta: k8sapi.ObjectMeta{
  394. Name: podName,
  395. },
  396. Spec: k8sapi.PodSpec{
  397. Containers: []k8sapi.Container{
  398. {
  399. Name: "profiler-test",
  400. Image: fmt.Sprintf("gcr.io/%s:latest", ImageName),
  401. },
  402. },
  403. },
  404. }
  405. if _, err := kubernetesClient.RunLongLivedPod(ctx, pod); err != nil {
  406. return fmt.Errorf("failed to run pod %s: %v", podName, err)
  407. }
  408. return nil
  409. }
  410. // PollPodLog polls the log of the kubernetes client and returns when the
  411. // finishString appears in the log, or when the context times out.
  412. func (tr *GKETestRunner) PollPodLog(ctx context.Context, kubernetesClient *kubernetes.Client, podName, finishString string) error {
  413. var output string
  414. defer func() {
  415. log.Printf("Log for pod %s:\n%s", podName, output)
  416. }()
  417. for {
  418. select {
  419. case <-ctx.Done():
  420. return fmt.Errorf("timed out waiting profiling finishing on container")
  421. case <-time.After(20 * time.Second):
  422. var err error
  423. output, err = kubernetesClient.PodLog(ctx, podName)
  424. if err != nil {
  425. // Transient failure.
  426. log.Printf("Transient error getting log (will retry): %v", err)
  427. continue
  428. }
  429. if strings.Contains(output, finishString) {
  430. return nil
  431. }
  432. }
  433. }
  434. }
  435. // DeleteClusterAndImage deletes cluster and images used to create cluster.
  436. func (tr *GKETestRunner) DeleteClusterAndImage(ctx context.Context, cfg *ClusterConfig) []error {
  437. var errs []error
  438. if err := tr.StorageClient.Bucket(cfg.Bucket).Object(cfg.ImageSourceName).Delete(ctx); err != nil {
  439. errs = append(errs, fmt.Errorf("failed to delete storage client: %v", err))
  440. }
  441. for _, err := range tr.deleteDockerImage(ctx, cfg.ImageName) {
  442. errs = append(errs, fmt.Errorf("failed to delete docker image: %v", err))
  443. }
  444. if _, err := tr.ContainerService.Projects.Zones.Clusters.Delete(cfg.ProjectID, cfg.Zone, cfg.ClusterName).Context(ctx).Do(); err != nil {
  445. errs = append(errs, fmt.Errorf("failed to delete cluster %s: %v", cfg.ClusterName, err))
  446. }
  447. return errs
  448. }
  449. // StartAndDeployCluster creates image needed for cluster, then starts and
  450. // deploys to cluster.
  451. func (tr *GKETestRunner) StartAndDeployCluster(ctx context.Context, cfg *ClusterConfig) error {
  452. if err := tr.uploadImageSource(ctx, cfg.Bucket, cfg.ImageSourceName, cfg.Dockerfile); err != nil {
  453. return fmt.Errorf("failed to upload image source: %v", err)
  454. }
  455. createImageCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
  456. defer cancel()
  457. if err := tr.createAndPublishDockerImage(createImageCtx, cfg.ProjectID, cfg.Bucket, cfg.ImageSourceName, fmt.Sprintf("gcr.io/%s", cfg.ImageName)); err != nil {
  458. return fmt.Errorf("failed to create and publish docker image %s: %v", cfg.ImageName, err)
  459. }
  460. kubernetesClient, err := gke.NewClient(ctx, cfg.ClusterName, gke.OptZone(cfg.Zone), gke.OptProject(cfg.ProjectID))
  461. if err != nil {
  462. return fmt.Errorf("failed to create new GKE client: %v", err)
  463. }
  464. deployContainerCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
  465. defer cancel()
  466. if err := tr.deployContainer(deployContainerCtx, kubernetesClient, cfg.PodName, cfg.ImageName); err != nil {
  467. return fmt.Errorf("failed to deploy image %q to pod %q: %v", cfg.PodName, cfg.ImageName, err)
  468. }
  469. return nil
  470. }
  471. // uploadImageSource uploads source code for building docker image to GCS.
  472. func (tr *GKETestRunner) uploadImageSource(ctx context.Context, bucket, objectName, dockerfile string) error {
  473. zipBuf := new(bytes.Buffer)
  474. z := zip.NewWriter(zipBuf)
  475. f, err := z.Create("Dockerfile")
  476. if err != nil {
  477. return err
  478. }
  479. if _, err := f.Write([]byte(dockerfile)); err != nil {
  480. return err
  481. }
  482. if err := z.Close(); err != nil {
  483. return err
  484. }
  485. wc := tr.StorageClient.Bucket(bucket).Object(objectName).NewWriter(ctx)
  486. wc.ContentType = "application/zip"
  487. wc.ACL = []storage.ACLRule{{storage.AllUsers, storage.RoleReader}}
  488. if _, err := wc.Write(zipBuf.Bytes()); err != nil {
  489. return err
  490. }
  491. return wc.Close()
  492. }