You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

553 lines
17 KiB

  1. // Copyright 2017 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Package profiler is a client for the Stackdriver Profiler service.
  15. //
  16. // This package is still experimental and subject to change.
  17. //
  18. // Usage example:
  19. //
  20. // import "cloud.google.com/go/profiler"
  21. // ...
  22. // if err := profiler.Start(profiler.Config{Service: "my-service"}); err != nil {
  23. // // TODO: Handle error.
  24. // }
  25. //
  26. // Calling Start will start a goroutine to collect profiles and upload to
  27. // the profiler server, at the rhythm specified by the server.
  28. //
  29. // The caller must provide the service string in the config, and may provide
  30. // other information as well. See Config for details.
  31. //
  32. // Profiler has CPU, heap and goroutine profiling enabled by default. Mutex
  33. // profiling can be enabled in the config. Note that goroutine and mutex
  34. // profiles are shown as "threads" and "contention" profiles in the profiler
  35. // UI.
  36. package profiler
  37. import (
  38. "bytes"
  39. "context"
  40. "errors"
  41. "fmt"
  42. "log"
  43. "os"
  44. "regexp"
  45. "runtime"
  46. "runtime/pprof"
  47. "sync"
  48. "time"
  49. gcemd "cloud.google.com/go/compute/metadata"
  50. "cloud.google.com/go/internal/version"
  51. "github.com/golang/protobuf/proto"
  52. "github.com/golang/protobuf/ptypes"
  53. "github.com/google/pprof/profile"
  54. gax "github.com/googleapis/gax-go/v2"
  55. "google.golang.org/api/option"
  56. gtransport "google.golang.org/api/transport/grpc"
  57. pb "google.golang.org/genproto/googleapis/devtools/cloudprofiler/v2"
  58. edpb "google.golang.org/genproto/googleapis/rpc/errdetails"
  59. "google.golang.org/grpc"
  60. "google.golang.org/grpc/codes"
  61. grpcmd "google.golang.org/grpc/metadata"
  62. "google.golang.org/grpc/status"
  63. )
var (
	// config is the package-level profiler configuration. It is assigned
	// by initializeConfig from the Config passed to Start.
	config Config
	// startOnce ensures that Start runs the start logic at most once.
	startOnce sync.Once
	// mutexEnabled records the result of enableMutexProfiling; it is only
	// set by start when Config.MutexProfiling is true.
	mutexEnabled bool
	// The functions below are stubbed to be overrideable for testing.
	getProjectID     = gcemd.ProjectID
	getInstanceName  = gcemd.InstanceName
	getZone          = gcemd.Zone
	startCPUProfile  = pprof.StartCPUProfile
	stopCPUProfile   = pprof.StopCPUProfile
	writeHeapProfile = pprof.WriteHeapProfile
	sleep            = gax.Sleep
	dialGRPC         = gtransport.Dial
	onGCE            = gcemd.OnGCE
	// serviceRegexp is the pattern that initializeConfig requires the
	// service name to match.
	serviceRegexp = regexp.MustCompile(`^[a-z]([-a-z0-9_.]{0,253}[a-z0-9])?$`)
)
const (
	// apiAddress is the endpoint used when Config.APIAddr is empty.
	apiAddress = "cloudprofiler.googleapis.com:443"
	// xGoogAPIMetadata is the outgoing metadata key set by withXGoogHeader.
	xGoogAPIMetadata = "x-goog-api-client"
	// Label keys attached to the deployment (zone, version, language) and
	// to uploaded profiles (instance) by initializeAgent.
	zoneNameLabel = "zone"
	versionLabel  = "version"
	languageLabel = "language"
	instanceLabel = "instance"
	// scope is the OAuth scope passed to the gRPC dialer in start.
	scope = "https://www.googleapis.com/auth/monitoring.write"
	// initialBackoff is the first pause of the retry backoff used by
	// createProfile.
	initialBackoff = time.Minute
	// Ensure the agent will recover within 1 hour.
	maxBackoff        = time.Hour
	backoffMultiplier = 1.3 // Backoff envelope increases by this factor on each retry.
	// retryInfoMetadata is the metadata key under which
	// abortedBackoffDuration looks for a serialized RetryInfo message.
	retryInfoMetadata = "google.rpc.retryinfo-bin"
)
// Config is the profiler configuration.
type Config struct {
	// Service must be provided to start the profiler. It specifies the name of
	// the service under which the profiled data will be recorded and exposed at
	// the Profiler UI for the project. You can specify an arbitrary string, but
	// see Deployment.target at
	// https://github.com/googleapis/googleapis/blob/master/google/devtools/cloudprofiler/v2/profiler.proto
	// for restrictions. If the parameter is not set, the agent will probe the
	// GAE_SERVICE environment variable, which is present in the Google App
	// Engine environment, and then the K_SERVICE environment variable.
	// NOTE: The string should be the same across different replicas of
	// your service so that the globally constant profiling rate is
	// maintained. Do not put things like PID or unique pod ID in the name.
	Service string
	// ServiceVersion is an optional field specifying the version of the
	// service. It can be an arbitrary string. Profiler profiles
	// once per minute for each version of each service in each zone.
	// ServiceVersion defaults to the GAE_VERSION environment variable if that
	// is set, then to the K_REVISION environment variable if that is set, or
	// to empty string otherwise.
	ServiceVersion string
	// DebugLogging enables detailed debug logging from profiler. It
	// defaults to false.
	DebugLogging bool
	// MutexProfiling enables mutex profiling. It defaults to false.
	// Note that mutex profiling is not supported by Go versions older
	// than Go 1.8; Start returns an error if it is requested but cannot be
	// enabled.
	MutexProfiling bool
	// NoAllocProfiling, when true, disables collection of allocation
	// profiles.
	NoAllocProfiling bool
	// AllocForceGC forces garbage collection before the collection of each heap
	// profile collected to produce the allocation profile. This increases the
	// accuracy of allocation profiling. It defaults to false.
	AllocForceGC bool
	// NoHeapProfiling, when true, disables collection of heap profiles.
	NoHeapProfiling bool
	// NoGoroutineProfiling, when true, disables collection of goroutine
	// profiles.
	NoGoroutineProfiling bool
	// ProjectID is the Cloud Console project ID to use instead of the one set by
	// GOOGLE_CLOUD_PROJECT environment variable or read from the VM metadata
	// server.
	//
	// Set this if you are running the agent in your local environment
	// or anywhere else outside of Google Cloud Platform.
	ProjectID string
	// APIAddr is the HTTP endpoint to use to connect to the profiler
	// agent API. Defaults to the production environment, overridable
	// for testing.
	APIAddr string
	// Instance is the name of the Compute Engine instance the profiler agent
	// runs on. This is normally determined from the Compute Engine metadata
	// server and doesn't need to be initialized. It needs to be set in rare
	// cases where the metadata server is present but is flaky or otherwise
	// misbehaves.
	Instance string
	// Zone is the zone of the Compute Engine instance the profiler agent runs
	// on. This is normally determined from the Compute Engine metadata server
	// and doesn't need to be initialized. It needs to be set in rare cases where
	// the metadata server is present but is flaky or otherwise misbehaves.
	Zone string
}
// startError holds the error, if any, returned by the one-time
// initialization and start of the agent. Start returns it on every call.
var startError error
  156. // Start starts a goroutine to collect and upload profiles. The
  157. // caller must provide the service string in the config. See
  158. // Config for details. Start should only be called once. Any
  159. // additional calls will be ignored.
  160. func Start(cfg Config, options ...option.ClientOption) error {
  161. startOnce.Do(func() {
  162. startError = start(cfg, options...)
  163. })
  164. return startError
  165. }
  166. func start(cfg Config, options ...option.ClientOption) error {
  167. if err := initializeConfig(cfg); err != nil {
  168. debugLog("failed to initialize config: %v", err)
  169. return err
  170. }
  171. if config.MutexProfiling {
  172. if mutexEnabled = enableMutexProfiling(); !mutexEnabled {
  173. return fmt.Errorf("mutex profiling is not supported by %s, requires Go 1.8 or later", runtime.Version())
  174. }
  175. }
  176. ctx := context.Background()
  177. opts := []option.ClientOption{
  178. option.WithEndpoint(config.APIAddr),
  179. option.WithScopes(scope),
  180. }
  181. opts = append(opts, options...)
  182. conn, err := dialGRPC(ctx, opts...)
  183. if err != nil {
  184. debugLog("failed to dial GRPC: %v", err)
  185. return err
  186. }
  187. a := initializeAgent(pb.NewProfilerServiceClient(conn))
  188. go pollProfilerService(withXGoogHeader(ctx), a)
  189. return nil
  190. }
  191. func debugLog(format string, e ...interface{}) {
  192. if config.DebugLogging {
  193. log.Printf(format, e...)
  194. }
  195. }
// agent polls the profiler server for instructions on behalf of a task,
// and collects and uploads profiles as requested.
type agent struct {
	// client is the profiler service client used to create and update
	// profiles.
	client pb.ProfilerServiceClient
	// deployment is sent with each CreateProfile request; its project ID
	// also forms the request's parent.
	deployment *pb.Deployment
	// profileLabels are attached to every profile before it is uploaded.
	profileLabels map[string]string
	// profileTypes lists the profile types sent with each CreateProfile
	// request.
	profileTypes []pb.ProfileType
}
  204. // abortedBackoffDuration retrieves the retry duration from gRPC trailing
  205. // metadata, which is set by the profiler server.
  206. func abortedBackoffDuration(md grpcmd.MD) (time.Duration, error) {
  207. elem := md[retryInfoMetadata]
  208. if len(elem) <= 0 {
  209. return 0, errors.New("no retry info")
  210. }
  211. var retryInfo edpb.RetryInfo
  212. if err := proto.Unmarshal([]byte(elem[0]), &retryInfo); err != nil {
  213. return 0, err
  214. } else if time, err := ptypes.Duration(retryInfo.RetryDelay); err != nil {
  215. return 0, err
  216. } else {
  217. if time < 0 {
  218. return 0, errors.New("negative retry duration")
  219. }
  220. return time, nil
  221. }
  222. }
// retryer decides how long to wait before retrying a failed call; see its
// Retry method.
type retryer struct {
	// backoff supplies the pause when no usable server-specified delay is
	// available.
	backoff gax.Backoff
	// md is the gRPC metadata that Retry searches for retry info when the
	// call failed with codes.Aborted.
	md grpcmd.MD
}
  227. func (r *retryer) Retry(err error) (time.Duration, bool) {
  228. st, _ := status.FromError(err)
  229. if st != nil && st.Code() == codes.Aborted {
  230. dur, err := abortedBackoffDuration(r.md)
  231. if err == nil {
  232. return dur, true
  233. }
  234. debugLog("failed to get backoff duration: %v", err)
  235. }
  236. return r.backoff.Pause(), true
  237. }
  238. // createProfile talks to the profiler server to create profile. In
  239. // case of error, the goroutine will sleep and retry. Sleep duration may
  240. // be specified by the server. Otherwise it will be an exponentially
  241. // increasing value, bounded by maxBackoff.
  242. func (a *agent) createProfile(ctx context.Context) *pb.Profile {
  243. req := pb.CreateProfileRequest{
  244. Parent: "projects/" + a.deployment.ProjectId,
  245. Deployment: a.deployment,
  246. ProfileType: a.profileTypes,
  247. }
  248. var p *pb.Profile
  249. md := grpcmd.New(map[string]string{})
  250. gax.Invoke(ctx, func(ctx context.Context, settings gax.CallSettings) error {
  251. var err error
  252. p, err = a.client.CreateProfile(ctx, &req, grpc.Trailer(&md))
  253. if err != nil {
  254. debugLog("failed to create a profile, will retry: %v", err)
  255. }
  256. return err
  257. }, gax.WithRetry(func() gax.Retryer {
  258. return &retryer{
  259. backoff: gax.Backoff{
  260. Initial: initialBackoff,
  261. Max: maxBackoff,
  262. Multiplier: backoffMultiplier,
  263. },
  264. md: md,
  265. }
  266. }))
  267. debugLog("successfully created profile %v", p.GetProfileType())
  268. return p
  269. }
  270. func (a *agent) profileAndUpload(ctx context.Context, p *pb.Profile) {
  271. var prof bytes.Buffer
  272. pt := p.GetProfileType()
  273. switch pt {
  274. case pb.ProfileType_CPU:
  275. duration, err := ptypes.Duration(p.Duration)
  276. if err != nil {
  277. debugLog("failed to get profile duration for CPU profile: %v", err)
  278. return
  279. }
  280. if err := startCPUProfile(&prof); err != nil {
  281. debugLog("failed to start CPU profile: %v", err)
  282. return
  283. }
  284. sleep(ctx, duration)
  285. stopCPUProfile()
  286. case pb.ProfileType_HEAP:
  287. if err := heapProfile(&prof); err != nil {
  288. debugLog("failed to write heap profile: %v", err)
  289. return
  290. }
  291. case pb.ProfileType_HEAP_ALLOC:
  292. duration, err := ptypes.Duration(p.Duration)
  293. if err != nil {
  294. debugLog("failed to get profile duration for allocation profile: %v", err)
  295. return
  296. }
  297. if err := deltaAllocProfile(ctx, duration, config.AllocForceGC, &prof); err != nil {
  298. debugLog("failed to collect allocation profile: %v", err)
  299. return
  300. }
  301. case pb.ProfileType_THREADS:
  302. if err := pprof.Lookup("goroutine").WriteTo(&prof, 0); err != nil {
  303. debugLog("failed to collect goroutine profile: %v", err)
  304. return
  305. }
  306. case pb.ProfileType_CONTENTION:
  307. duration, err := ptypes.Duration(p.Duration)
  308. if err != nil {
  309. debugLog("failed to get profile duration: %v", err)
  310. return
  311. }
  312. if err := deltaMutexProfile(ctx, duration, &prof); err != nil {
  313. debugLog("failed to collect mutex profile: %v", err)
  314. return
  315. }
  316. default:
  317. debugLog("unexpected profile type: %v", pt)
  318. return
  319. }
  320. // Starting Go 1.9 the profiles are symbolized by runtime/pprof.
  321. // TODO(jianqiaoli): Remove the symbolization code when we decide to
  322. // stop supporting Go 1.8.
  323. if !shouldAssumeSymbolized && pt != pb.ProfileType_CONTENTION {
  324. if err := parseAndSymbolize(&prof); err != nil {
  325. debugLog("failed to symbolize profile: %v", err)
  326. }
  327. }
  328. p.ProfileBytes = prof.Bytes()
  329. p.Labels = a.profileLabels
  330. req := pb.UpdateProfileRequest{Profile: p}
  331. // Upload profile, discard profile in case of error.
  332. debugLog("start uploading profile")
  333. if _, err := a.client.UpdateProfile(ctx, &req); err != nil {
  334. debugLog("failed to upload profile: %v", err)
  335. }
  336. }
  337. // deltaMutexProfile writes mutex profile changes over a time period specified
  338. // with 'duration' to 'prof'.
  339. func deltaMutexProfile(ctx context.Context, duration time.Duration, prof *bytes.Buffer) error {
  340. if !mutexEnabled {
  341. return errors.New("mutex profiling is not enabled")
  342. }
  343. p0, err := mutexProfile()
  344. if err != nil {
  345. return err
  346. }
  347. sleep(ctx, duration)
  348. p, err := mutexProfile()
  349. if err != nil {
  350. return err
  351. }
  352. p0.Scale(-1)
  353. p, err = profile.Merge([]*profile.Profile{p0, p})
  354. if err != nil {
  355. return err
  356. }
  357. // The mutex profile is not symbolized by runtime.pprof until
  358. // golang.org/issue/21474 is fixed in go1.10.
  359. symbolize(p)
  360. return p.Write(prof)
  361. }
  362. func mutexProfile() (*profile.Profile, error) {
  363. p := pprof.Lookup("mutex")
  364. if p == nil {
  365. return nil, errors.New("mutex profiling is not supported")
  366. }
  367. var buf bytes.Buffer
  368. if err := p.WriteTo(&buf, 0); err != nil {
  369. return nil, err
  370. }
  371. return profile.Parse(&buf)
  372. }
  373. // withXGoogHeader sets the name and version of the application in
  374. // the `x-goog-api-client` header passed on each request. Intended for
  375. // use by Google-written clients.
  376. func withXGoogHeader(ctx context.Context, keyval ...string) context.Context {
  377. kv := append([]string{"gl-go", version.Go(), "gccl", version.Repo}, keyval...)
  378. kv = append(kv, "gax", gax.Version, "grpc", grpc.Version)
  379. md, _ := grpcmd.FromOutgoingContext(ctx)
  380. md = md.Copy()
  381. md[xGoogAPIMetadata] = []string{gax.XGoogHeader(kv...)}
  382. return grpcmd.NewOutgoingContext(ctx, md)
  383. }
  384. func initializeAgent(c pb.ProfilerServiceClient) *agent {
  385. labels := map[string]string{languageLabel: "go"}
  386. if config.Zone != "" {
  387. labels[zoneNameLabel] = config.Zone
  388. }
  389. if config.ServiceVersion != "" {
  390. labels[versionLabel] = config.ServiceVersion
  391. }
  392. d := &pb.Deployment{
  393. ProjectId: config.ProjectID,
  394. Target: config.Service,
  395. Labels: labels,
  396. }
  397. profileLabels := map[string]string{}
  398. if config.Instance != "" {
  399. profileLabels[instanceLabel] = config.Instance
  400. }
  401. profileTypes := []pb.ProfileType{pb.ProfileType_CPU}
  402. if !config.NoHeapProfiling {
  403. profileTypes = append(profileTypes, pb.ProfileType_HEAP)
  404. }
  405. if !config.NoGoroutineProfiling {
  406. profileTypes = append(profileTypes, pb.ProfileType_THREADS)
  407. }
  408. if !config.NoAllocProfiling {
  409. profileTypes = append(profileTypes, pb.ProfileType_HEAP_ALLOC)
  410. }
  411. if mutexEnabled {
  412. profileTypes = append(profileTypes, pb.ProfileType_CONTENTION)
  413. }
  414. return &agent{
  415. client: c,
  416. deployment: d,
  417. profileLabels: profileLabels,
  418. profileTypes: profileTypes,
  419. }
  420. }
  421. func initializeConfig(cfg Config) error {
  422. config = cfg
  423. if config.Service == "" {
  424. for _, ev := range []string{"GAE_SERVICE", "K_SERVICE"} {
  425. if val := os.Getenv(ev); val != "" {
  426. config.Service = val
  427. break
  428. }
  429. }
  430. }
  431. if config.Service == "" {
  432. return errors.New("service name must be configured")
  433. }
  434. if !serviceRegexp.MatchString(config.Service) {
  435. return fmt.Errorf("service name %q does not match regular expression %v", config.Service, serviceRegexp)
  436. }
  437. if config.ServiceVersion == "" {
  438. for _, ev := range []string{"GAE_VERSION", "K_REVISION"} {
  439. if val := os.Getenv(ev); val != "" {
  440. config.ServiceVersion = val
  441. break
  442. }
  443. }
  444. }
  445. if projectID := os.Getenv("GOOGLE_CLOUD_PROJECT"); config.ProjectID == "" && projectID != "" {
  446. // Cloud Shell and App Engine set this environment variable to the project
  447. // ID, so use it if present. In case of App Engine the project ID is also
  448. // available from the GCE metadata server, but by using the environment
  449. // variable saves one request to the metadata server. The environment
  450. // project ID is only used if no project ID is provided in the
  451. // configuration.
  452. config.ProjectID = projectID
  453. }
  454. if onGCE() {
  455. var err error
  456. if config.ProjectID == "" {
  457. if config.ProjectID, err = getProjectID(); err != nil {
  458. return fmt.Errorf("failed to get the project ID from Compute Engine metadata: %v", err)
  459. }
  460. }
  461. if config.Zone == "" {
  462. if config.Zone, err = getZone(); err != nil {
  463. return fmt.Errorf("failed to get zone from Compute Engine metadata: %v", err)
  464. }
  465. }
  466. if config.Instance == "" {
  467. if config.Instance, err = getInstanceName(); err != nil {
  468. if _, ok := err.(gcemd.NotDefinedError); !ok {
  469. return fmt.Errorf("failed to get instance name from Compute Engine metadata: %v", err)
  470. }
  471. debugLog("failed to get instance name from Compute Engine metadata, will use empty name: %v", err)
  472. }
  473. }
  474. } else {
  475. if config.ProjectID == "" {
  476. return fmt.Errorf("project ID must be specified in the configuration if running outside of GCP")
  477. }
  478. }
  479. if config.APIAddr == "" {
  480. config.APIAddr = apiAddress
  481. }
  482. return nil
  483. }
  484. // pollProfilerService starts an endless loop to poll the profiler
  485. // server for instructions, and collects and uploads profiles as
  486. // requested.
  487. func pollProfilerService(ctx context.Context, a *agent) {
  488. debugLog("profiler has started")
  489. for {
  490. p := a.createProfile(ctx)
  491. a.profileAndUpload(ctx, p)
  492. }
  493. }