You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

505 lines
15 KiB

  1. // Copyright 2017 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Package profiler is a client for the Stackdriver Profiler service.
  15. //
  16. // This package is still experimental and subject to change.
  17. //
  18. // Usage example:
  19. //
  20. // import "cloud.google.com/go/profiler"
  21. // ...
  22. // if err := profiler.Start(profiler.Config{Service: "my-service"}); err != nil {
  23. // // TODO: Handle error.
  24. // }
  25. //
  26. // Calling Start will start a goroutine to collect profiles and upload to
  27. // the profiler server, at the rhythm specified by the server.
  28. //
  29. // The caller must provide the service string in the config, and may provide
  30. // other information as well. See Config for details.
  31. //
  32. // Profiler has CPU, heap and goroutine profiling enabled by default. Mutex
  33. // profiling can be enabled in the config. Note that goroutine and mutex
  34. // profiles are shown as "threads" and "contention" profiles in the profiler
  35. // UI.
  36. package profiler
  37. import (
  38. "bytes"
  39. "errors"
  40. "fmt"
  41. "log"
  42. "os"
  43. "runtime"
  44. "runtime/pprof"
  45. "sync"
  46. "time"
  47. gcemd "cloud.google.com/go/compute/metadata"
  48. "cloud.google.com/go/internal/version"
  49. "github.com/golang/protobuf/proto"
  50. "github.com/golang/protobuf/ptypes"
  51. "github.com/google/pprof/profile"
  52. gax "github.com/googleapis/gax-go"
  53. "golang.org/x/net/context"
  54. "google.golang.org/api/option"
  55. gtransport "google.golang.org/api/transport/grpc"
  56. pb "google.golang.org/genproto/googleapis/devtools/cloudprofiler/v2"
  57. edpb "google.golang.org/genproto/googleapis/rpc/errdetails"
  58. "google.golang.org/grpc"
  59. "google.golang.org/grpc/codes"
  60. grpcmd "google.golang.org/grpc/metadata"
  61. "google.golang.org/grpc/status"
  62. )
var (
	// config is the package-wide profiler configuration. initializeConfig
	// overwrites it with the caller-supplied Config and then fills in
	// defaults.
	config Config
	// startOnce guards Start so the agent is started only once.
	startOnce sync.Once
	// mutexEnabled is set in start when mutex profiling was requested and
	// successfully enabled; it gates the contention profile type.
	mutexEnabled bool

	// The functions below are stubbed to be overrideable for testing.
	getProjectID     = gcemd.ProjectID
	getInstanceName  = gcemd.InstanceName
	getZone          = gcemd.Zone
	startCPUProfile  = pprof.StartCPUProfile
	stopCPUProfile   = pprof.StopCPUProfile
	writeHeapProfile = pprof.WriteHeapProfile
	sleep            = gax.Sleep
	dialGRPC         = gtransport.Dial
	onGCE            = gcemd.OnGCE
)
const (
	// apiAddress is the default profiler API endpoint, used when
	// Config.APIAddr is empty.
	apiAddress = "cloudprofiler.googleapis.com:443"
	// xGoogAPIMetadata is the outgoing metadata key that carries the
	// client name and version information.
	xGoogAPIMetadata = "x-goog-api-client"

	// Label keys attached to the deployment (zone, version, language) and
	// to uploaded profiles (instance).
	zoneNameLabel = "zone"
	versionLabel  = "version"
	languageLabel = "language"
	instanceLabel = "instance"

	// scope is the OAuth scope requested when dialing the profiler API.
	scope = "https://www.googleapis.com/auth/monitoring.write"

	// initialBackoff is the first pause of the fallback retry backoff used
	// when creating a profile fails.
	initialBackoff = time.Minute
	// Ensure the agent will recover within 1 hour.
	maxBackoff        = time.Hour
	backoffMultiplier = 1.3 // Backoff envelope increases by this factor on each retry.

	// retryInfoMetadata is the trailing metadata key under which the
	// server sends a serialized google.rpc.RetryInfo message.
	retryInfoMetadata = "google.rpc.retryinfo-bin"
)
// Config is the profiler configuration.
type Config struct {
	// Service must be provided to start the profiler. It specifies the name of
	// the service under which the profiled data will be recorded and exposed at
	// the Profiler UI for the project. You can specify an arbitrary string, but
	// see Deployment.target at
	// https://github.com/googleapis/googleapis/blob/master/google/devtools/cloudprofiler/v2/profiler.proto
	// for restrictions. If the parameter is not set, the agent will probe
	// GAE_SERVICE environment variable which is present in Google App Engine
	// environment.
	// NOTE: The string should be the same across different replicas of
	// your service so that the globally constant profiling rate is
	// maintained. Do not put things like PID or unique pod ID in the name.
	Service string

	// ServiceVersion is an optional field specifying the version of the
	// service. It can be an arbitrary string. Profiler profiles
	// once per minute for each version of each service in each zone.
	// ServiceVersion defaults to GAE_VERSION environment variable if that is
	// set, or to empty string otherwise.
	ServiceVersion string

	// DebugLogging enables detailed debug logging from profiler. It
	// defaults to false.
	DebugLogging bool

	// MutexProfiling enables mutex profiling. It defaults to false.
	// Note that mutex profiling is not supported by Go versions older
	// than Go 1.8; Start returns an error if it is requested but cannot
	// be enabled.
	MutexProfiling bool

	// NoHeapProfiling disables collection of heap profiles when true.
	NoHeapProfiling bool

	// NoGoroutineProfiling disables collection of goroutine profiles
	// when true.
	NoGoroutineProfiling bool

	// ProjectID is the Cloud Console project ID to use instead of the one set by
	// GOOGLE_CLOUD_PROJECT environment variable or read from the VM metadata
	// server.
	//
	// Set this if you are running the agent in your local environment
	// or anywhere else outside of Google Cloud Platform.
	ProjectID string

	// APIAddr is the address of the profiler agent API endpoint, which
	// the agent dials over gRPC. Defaults to the production environment,
	// overridable for testing.
	APIAddr string

	// instance is the instance name read from the metadata server when
	// running on Compute Engine; when non-empty it is attached to
	// uploaded profiles as the "instance" label.
	instance string

	// zone is the zone read from the metadata server when running on
	// Compute Engine; when non-empty it is attached to the deployment as
	// the "zone" label.
	zone string
}
// startError holds the error, if any, that occurred while initializing
// and starting the agent. It is set by the first call to Start and
// returned by every call.
var startError error
  140. // Start starts a goroutine to collect and upload profiles. The
  141. // caller must provide the service string in the config. See
  142. // Config for details. Start should only be called once. Any
  143. // additional calls will be ignored.
  144. func Start(cfg Config, options ...option.ClientOption) error {
  145. startOnce.Do(func() {
  146. startError = start(cfg, options...)
  147. })
  148. return startError
  149. }
  150. func start(cfg Config, options ...option.ClientOption) error {
  151. if err := initializeConfig(cfg); err != nil {
  152. debugLog("failed to initialize config: %v", err)
  153. return err
  154. }
  155. if config.MutexProfiling {
  156. if mutexEnabled = enableMutexProfiling(); !mutexEnabled {
  157. return fmt.Errorf("mutex profiling is not supported by %s, requires Go 1.8 or later", runtime.Version())
  158. }
  159. }
  160. ctx := context.Background()
  161. opts := []option.ClientOption{
  162. option.WithEndpoint(config.APIAddr),
  163. option.WithScopes(scope),
  164. }
  165. opts = append(opts, options...)
  166. conn, err := dialGRPC(ctx, opts...)
  167. if err != nil {
  168. debugLog("failed to dial GRPC: %v", err)
  169. return err
  170. }
  171. a := initializeAgent(pb.NewProfilerServiceClient(conn))
  172. go pollProfilerService(withXGoogHeader(ctx), a)
  173. return nil
  174. }
  175. func debugLog(format string, e ...interface{}) {
  176. if config.DebugLogging {
  177. log.Printf(format, e...)
  178. }
  179. }
// agent polls the profiler server for instructions on behalf of a task,
// and collects and uploads profiles as requested.
type agent struct {
	// client is the profiler service client used to create and update
	// profiles.
	client pb.ProfilerServiceClient
	// deployment identifies the profiled deployment and is sent with
	// every create-profile request.
	deployment *pb.Deployment
	// profileLabels are attached to each profile before it is uploaded.
	profileLabels map[string]string
	// profileTypes lists the profile types offered to the server in
	// create-profile requests.
	profileTypes []pb.ProfileType
}
  188. // abortedBackoffDuration retrieves the retry duration from gRPC trailing
  189. // metadata, which is set by the profiler server.
  190. func abortedBackoffDuration(md grpcmd.MD) (time.Duration, error) {
  191. elem := md[retryInfoMetadata]
  192. if len(elem) <= 0 {
  193. return 0, errors.New("no retry info")
  194. }
  195. var retryInfo edpb.RetryInfo
  196. if err := proto.Unmarshal([]byte(elem[0]), &retryInfo); err != nil {
  197. return 0, err
  198. } else if time, err := ptypes.Duration(retryInfo.RetryDelay); err != nil {
  199. return 0, err
  200. } else {
  201. if time < 0 {
  202. return 0, errors.New("negative retry duration")
  203. }
  204. return time, nil
  205. }
  206. }
// retryer computes the pause before a failed call is retried. Its Retry
// method never gives up.
type retryer struct {
	// backoff supplies the fallback pause when the server did not
	// provide a usable retry delay.
	backoff gax.Backoff
	// md is the metadata searched for server-provided retry info when a
	// call fails with codes.Aborted.
	md grpcmd.MD
}
  211. func (r *retryer) Retry(err error) (time.Duration, bool) {
  212. st, _ := status.FromError(err)
  213. if st != nil && st.Code() == codes.Aborted {
  214. dur, err := abortedBackoffDuration(r.md)
  215. if err == nil {
  216. return dur, true
  217. }
  218. debugLog("failed to get backoff duration: %v", err)
  219. }
  220. return r.backoff.Pause(), true
  221. }
  222. // createProfile talks to the profiler server to create profile. In
  223. // case of error, the goroutine will sleep and retry. Sleep duration may
  224. // be specified by the server. Otherwise it will be an exponentially
  225. // increasing value, bounded by maxBackoff.
  226. func (a *agent) createProfile(ctx context.Context) *pb.Profile {
  227. req := pb.CreateProfileRequest{
  228. Deployment: a.deployment,
  229. ProfileType: a.profileTypes,
  230. }
  231. var p *pb.Profile
  232. md := grpcmd.New(map[string]string{})
  233. gax.Invoke(ctx, func(ctx context.Context, settings gax.CallSettings) error {
  234. var err error
  235. p, err = a.client.CreateProfile(ctx, &req, grpc.Trailer(&md))
  236. if err != nil {
  237. debugLog("failed to create a profile, will retry: %v", err)
  238. }
  239. return err
  240. }, gax.WithRetry(func() gax.Retryer {
  241. return &retryer{
  242. backoff: gax.Backoff{
  243. Initial: initialBackoff,
  244. Max: maxBackoff,
  245. Multiplier: backoffMultiplier,
  246. },
  247. md: md,
  248. }
  249. }))
  250. debugLog("successfully created profile %v", p.GetProfileType())
  251. return p
  252. }
  253. func (a *agent) profileAndUpload(ctx context.Context, p *pb.Profile) {
  254. var prof bytes.Buffer
  255. pt := p.GetProfileType()
  256. switch pt {
  257. case pb.ProfileType_CPU:
  258. duration, err := ptypes.Duration(p.Duration)
  259. if err != nil {
  260. debugLog("failed to get profile duration: %v", err)
  261. return
  262. }
  263. if err := startCPUProfile(&prof); err != nil {
  264. debugLog("failed to start CPU profile: %v", err)
  265. return
  266. }
  267. sleep(ctx, duration)
  268. stopCPUProfile()
  269. case pb.ProfileType_HEAP:
  270. if err := writeHeapProfile(&prof); err != nil {
  271. debugLog("failed to write heap profile: %v", err)
  272. return
  273. }
  274. case pb.ProfileType_THREADS:
  275. if err := pprof.Lookup("goroutine").WriteTo(&prof, 0); err != nil {
  276. debugLog("failed to create goroutine profile: %v", err)
  277. return
  278. }
  279. case pb.ProfileType_CONTENTION:
  280. duration, err := ptypes.Duration(p.Duration)
  281. if err != nil {
  282. debugLog("failed to get profile duration: %v", err)
  283. return
  284. }
  285. if err := deltaMutexProfile(ctx, duration, &prof); err != nil {
  286. debugLog("failed to create mutex profile: %v", err)
  287. return
  288. }
  289. default:
  290. debugLog("unexpected profile type: %v", pt)
  291. return
  292. }
  293. // Starting Go 1.9 the profiles are symbolized by runtime/pprof.
  294. // TODO(jianqiaoli): Remove the symbolization code when we decide to
  295. // stop supporting Go 1.8.
  296. if !shouldAssumeSymbolized && pt != pb.ProfileType_CONTENTION {
  297. if err := parseAndSymbolize(&prof); err != nil {
  298. debugLog("failed to symbolize profile: %v", err)
  299. }
  300. }
  301. p.ProfileBytes = prof.Bytes()
  302. p.Labels = a.profileLabels
  303. req := pb.UpdateProfileRequest{Profile: p}
  304. // Upload profile, discard profile in case of error.
  305. debugLog("start uploading profile")
  306. if _, err := a.client.UpdateProfile(ctx, &req); err != nil {
  307. debugLog("failed to upload profile: %v", err)
  308. }
  309. }
  310. // deltaMutexProfile writes mutex profile changes over a time period specified
  311. // with 'duration' to 'prof'.
  312. func deltaMutexProfile(ctx context.Context, duration time.Duration, prof *bytes.Buffer) error {
  313. if !mutexEnabled {
  314. return errors.New("mutex profiling is not enabled")
  315. }
  316. p0, err := mutexProfile()
  317. if err != nil {
  318. return err
  319. }
  320. sleep(ctx, duration)
  321. p, err := mutexProfile()
  322. if err != nil {
  323. return err
  324. }
  325. // TODO(jianqiaoli): Remove this check when github.com/google/pprof/issues/242
  326. // is fixed.
  327. if len(p0.Mapping) > 0 {
  328. p0.Scale(-1)
  329. p, err = profile.Merge([]*profile.Profile{p0, p})
  330. if err != nil {
  331. return err
  332. }
  333. }
  334. // The mutex profile is not symbolized by runtime.pprof until
  335. // golang.org/issue/21474 is fixed in go1.10.
  336. symbolize(p)
  337. return p.Write(prof)
  338. }
  339. func mutexProfile() (*profile.Profile, error) {
  340. p := pprof.Lookup("mutex")
  341. if p == nil {
  342. return nil, errors.New("mutex profiling is not supported")
  343. }
  344. var buf bytes.Buffer
  345. if err := p.WriteTo(&buf, 0); err != nil {
  346. return nil, err
  347. }
  348. return profile.Parse(&buf)
  349. }
  350. // withXGoogHeader sets the name and version of the application in
  351. // the `x-goog-api-client` header passed on each request. Intended for
  352. // use by Google-written clients.
  353. func withXGoogHeader(ctx context.Context, keyval ...string) context.Context {
  354. kv := append([]string{"gl-go", version.Go(), "gccl", version.Repo}, keyval...)
  355. kv = append(kv, "gax", gax.Version, "grpc", grpc.Version)
  356. md, _ := grpcmd.FromOutgoingContext(ctx)
  357. md = md.Copy()
  358. md[xGoogAPIMetadata] = []string{gax.XGoogHeader(kv...)}
  359. return grpcmd.NewOutgoingContext(ctx, md)
  360. }
  361. func initializeAgent(c pb.ProfilerServiceClient) *agent {
  362. labels := map[string]string{languageLabel: "go"}
  363. if config.zone != "" {
  364. labels[zoneNameLabel] = config.zone
  365. }
  366. if config.ServiceVersion != "" {
  367. labels[versionLabel] = config.ServiceVersion
  368. }
  369. d := &pb.Deployment{
  370. ProjectId: config.ProjectID,
  371. Target: config.Service,
  372. Labels: labels,
  373. }
  374. profileLabels := map[string]string{}
  375. if config.instance != "" {
  376. profileLabels[instanceLabel] = config.instance
  377. }
  378. profileTypes := []pb.ProfileType{pb.ProfileType_CPU}
  379. if !config.NoHeapProfiling {
  380. profileTypes = append(profileTypes, pb.ProfileType_HEAP)
  381. }
  382. if !config.NoGoroutineProfiling {
  383. profileTypes = append(profileTypes, pb.ProfileType_THREADS)
  384. }
  385. if mutexEnabled {
  386. profileTypes = append(profileTypes, pb.ProfileType_CONTENTION)
  387. }
  388. return &agent{
  389. client: c,
  390. deployment: d,
  391. profileLabels: profileLabels,
  392. profileTypes: profileTypes,
  393. }
  394. }
  395. func initializeConfig(cfg Config) error {
  396. config = cfg
  397. if config.Service == "" {
  398. config.Service = os.Getenv("GAE_SERVICE")
  399. }
  400. if config.Service == "" {
  401. return errors.New("service name must be configured")
  402. }
  403. if config.ServiceVersion == "" {
  404. config.ServiceVersion = os.Getenv("GAE_VERSION")
  405. }
  406. if projectID := os.Getenv("GOOGLE_CLOUD_PROJECT"); config.ProjectID == "" && projectID != "" {
  407. // Cloud Shell and App Engine set this environment variable to the project
  408. // ID, so use it if present. In case of App Engine the project ID is also
  409. // available from the GCE metadata server, but by using the environment
  410. // variable saves one request to the metadata server. The environment
  411. // project ID is only used if no project ID is provided in the
  412. // configuration.
  413. config.ProjectID = projectID
  414. }
  415. if onGCE() {
  416. var err error
  417. if config.ProjectID == "" {
  418. if config.ProjectID, err = getProjectID(); err != nil {
  419. return fmt.Errorf("failed to get the project ID from Compute Engine: %v", err)
  420. }
  421. }
  422. if config.zone, err = getZone(); err != nil {
  423. return fmt.Errorf("failed to get zone from Compute Engine: %v", err)
  424. }
  425. if config.instance, err = getInstanceName(); err != nil {
  426. return fmt.Errorf("failed to get instance from Compute Engine: %v", err)
  427. }
  428. } else {
  429. if config.ProjectID == "" {
  430. return fmt.Errorf("project ID must be specified in the configuration if running outside of GCP")
  431. }
  432. }
  433. if config.APIAddr == "" {
  434. config.APIAddr = apiAddress
  435. }
  436. return nil
  437. }
  438. // pollProfilerService starts an endless loop to poll the profiler
  439. // server for instructions, and collects and uploads profiles as
  440. // requested.
  441. func pollProfilerService(ctx context.Context, a *agent) {
  442. debugLog("profiler has started")
  443. for {
  444. p := a.createProfile(ctx)
  445. a.profileAndUpload(ctx, p)
  446. }
  447. }