You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

609 rivejä
18 KiB

  1. /*
  2. *
  3. * Copyright 2019 gRPC authors.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. */
  18. // Package xds implements a balancer that communicates with a remote balancer using the Envoy xDS
  19. // protocol.
  20. package xds
  21. import (
  22. "context"
  23. "encoding/json"
  24. "errors"
  25. "fmt"
  26. "reflect"
  27. "sync"
  28. "time"
  29. xdspb "github.com/envoyproxy/go-control-plane/envoy/api/v2"
  30. "github.com/gogo/protobuf/proto"
  31. "google.golang.org/grpc/balancer"
  32. "google.golang.org/grpc/balancer/xds/edsbalancer"
  33. "google.golang.org/grpc/connectivity"
  34. "google.golang.org/grpc/grpclog"
  35. "google.golang.org/grpc/resolver"
  36. )
const (
	// defaultTimeout is the initial value of startupTimeout.
	defaultTimeout = 10 * time.Second
	// xdsName is the name reported by xdsBalancerBuilder.Name.
	xdsName = "xds"
)
var (
	// startupTimeout is copied into every xdsBalancer created by Build. It is a variable for
	// testing purposes.
	// TODO: if later we make startupTimeout configurable through BuildOptions(maybe?), then we can remove
	// this field and configure through BuildOptions instead.
	startupTimeout = defaultTimeout
	// newEDSBalancer constructs the eds balancer on top of the given ClientConn.
	// NOTE(review): presumably a variable so that tests can substitute a fake eds balancer — confirm.
	newEDSBalancer = func(cc balancer.ClientConn) edsBalancerInterface {
		return edsbalancer.NewXDSBalancer(cc)
	}
)
  50. func init() {
  51. balancer.Register(newXDSBalancerBuilder())
  52. }
// xdsBalancerBuilder is the builder for xdsBalancer. It has no fields.
type xdsBalancerBuilder struct{}
  54. func newXDSBalancerBuilder() balancer.Builder {
  55. return &xdsBalancerBuilder{}
  56. }
  57. func (b *xdsBalancerBuilder) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
  58. ctx, cancel := context.WithCancel(context.Background())
  59. x := &xdsBalancer{
  60. ctx: ctx,
  61. cancel: cancel,
  62. buildOpts: opts,
  63. startupTimeout: startupTimeout,
  64. connStateMgr: &connStateMgr{},
  65. startup: true,
  66. grpcUpdate: make(chan interface{}),
  67. xdsClientUpdate: make(chan interface{}),
  68. timer: createDrainedTimer(), // initialized a timer that won't fire without reset
  69. }
  70. x.cc = &xdsClientConn{
  71. updateState: x.connStateMgr.updateState,
  72. ClientConn: cc,
  73. }
  74. go x.run()
  75. return x
  76. }
// Name returns the name of the xds balancer, xdsName.
func (b *xdsBalancerBuilder) Name() string {
	return xdsName
}
// edsBalancerInterface defines the interface that edsBalancer must implement to
// communicate with xdsBalancer.
//
// It's implemented by the real eds balancer and a fake testing eds balancer.
type edsBalancerInterface interface {
	// HandleEDSResponse passes the EDS message received from the traffic director to the eds balancer.
	HandleEDSResponse(edsResp *xdspb.ClusterLoadAssignment)
	// HandleChildPolicy tells the eds balancer which intra-cluster load balancing policy to use.
	HandleChildPolicy(name string, config json.RawMessage)
	// HandleSubConnStateChange handles state change for SubConn.
	HandleSubConnStateChange(sc balancer.SubConn, state connectivity.State)
	// Close closes the eds balancer.
	Close()
}
// xdsBalancer manages xdsClient and the actual balancer that does load balancing (either edsBalancer,
// or fallback LB).
type xdsBalancer struct {
	cc              balancer.ClientConn // *xdsClientConn
	buildOpts       balancer.BuildOptions
	startupTimeout  time.Duration  // duration the fallback timer is armed with during startup
	xdsStaleTimeout *time.Duration // when non-nil, duration the fallback timer is armed with after contact is lost
	connStateMgr    *connStateMgr

	// ctx is canceled by Close; run shuts everything down once it is done.
	ctx    context.Context
	cancel context.CancelFunc

	startup           bool // startup indicates whether this xdsBalancer is in startup stage.
	inFallbackMonitor bool // set when a loseContact signal starts fallback monitoring; reset by cancelFallbackMonitoring

	// xdsBalancer continuously monitors the channels below, and will handle events from them in sync.
	grpcUpdate      chan interface{}
	xdsClientUpdate chan interface{}
	timer           *time.Timer
	noSubConnAlert  <-chan struct{}

	client           *client    // may change when passed a different service config
	config           *xdsConfig // may change when passed a different service config
	xdsLB            edsBalancerInterface
	fallbackLB       balancer.Balancer
	fallbackInitData *addressUpdate // may change when HandleResolved address is called
}
  117. func (x *xdsBalancer) startNewXDSClient(u *xdsConfig) {
  118. // If the xdsBalancer is in startup stage, then we need to apply the startup timeout for the first
  119. // xdsClient to get a response from the traffic director.
  120. if x.startup {
  121. x.startFallbackMonitoring()
  122. }
  123. // Whenever service config gives a new traffic director name, we need to create an xds client to
  124. // connect to it. However, previous xds client should not be closed until the new one successfully
  125. // connects to the traffic director (i.e. get an ADS response from the traffic director). Therefore,
  126. // we let each new client to be responsible to close its immediate predecessor. In this way,
  127. // xdsBalancer does not to implement complex synchronization to achieve the same purpose.
  128. prevClient := x.client
  129. // haveGotADS is true means, this xdsClient has got ADS response from director in the past, which
  130. // means it can close previous client if it hasn't and it now can send lose contact signal for
  131. // fallback monitoring.
  132. var haveGotADS bool
  133. // set up callbacks for the xds client.
  134. newADS := func(ctx context.Context, resp proto.Message) error {
  135. if !haveGotADS {
  136. if prevClient != nil {
  137. prevClient.close()
  138. }
  139. haveGotADS = true
  140. }
  141. return x.newADSResponse(ctx, resp)
  142. }
  143. loseContact := func(ctx context.Context) {
  144. // loseContact signal is only useful when the current xds client has received ADS response before,
  145. // and has not been closed by later xds client.
  146. if haveGotADS {
  147. select {
  148. case <-ctx.Done():
  149. return
  150. default:
  151. }
  152. x.loseContact(ctx)
  153. }
  154. }
  155. exitCleanup := func() {
  156. // Each xds client is responsible to close its predecessor if there's one. There are two paths
  157. // for a xds client to close its predecessor:
  158. // 1. Once it receives its first ADS response.
  159. // 2. It hasn't received its first ADS response yet, but its own successor has received ADS
  160. // response (which triggers the exit of it). Therefore, it needs to close its predecessor if
  161. // it has one.
  162. // Here the exitCleanup is for the 2nd path.
  163. if !haveGotADS && prevClient != nil {
  164. prevClient.close()
  165. }
  166. }
  167. x.client = newXDSClient(u.BalancerName, x.cc.Target(), u.ChildPolicy == nil, x.buildOpts, newADS, loseContact, exitCleanup)
  168. go x.client.run()
  169. }
  170. // run gets executed in a goroutine once xdsBalancer is created. It monitors updates from grpc,
  171. // xdsClient and load balancer. It synchronizes the operations that happen inside xdsBalancer. It
  172. // exits when xdsBalancer is closed.
  173. func (x *xdsBalancer) run() {
  174. for {
  175. select {
  176. case update := <-x.grpcUpdate:
  177. x.handleGRPCUpdate(update)
  178. case update := <-x.xdsClientUpdate:
  179. x.handleXDSClientUpdate(update)
  180. case <-x.timer.C: // x.timer.C will block if we are not in fallback monitoring stage.
  181. x.switchFallback()
  182. case <-x.noSubConnAlert: // x.noSubConnAlert will block if we are not in fallback monitoring stage.
  183. x.switchFallback()
  184. case <-x.ctx.Done():
  185. if x.client != nil {
  186. x.client.close()
  187. }
  188. if x.xdsLB != nil {
  189. x.xdsLB.Close()
  190. }
  191. if x.fallbackLB != nil {
  192. x.fallbackLB.Close()
  193. }
  194. return
  195. }
  196. }
  197. }
  198. func (x *xdsBalancer) handleGRPCUpdate(update interface{}) {
  199. switch u := update.(type) {
  200. case *addressUpdate:
  201. if x.fallbackLB != nil {
  202. x.fallbackLB.HandleResolvedAddrs(u.addrs, u.err)
  203. }
  204. x.fallbackInitData = u
  205. case *subConnStateUpdate:
  206. if x.xdsLB != nil {
  207. x.xdsLB.HandleSubConnStateChange(u.sc, u.state)
  208. }
  209. if x.fallbackLB != nil {
  210. x.fallbackLB.HandleSubConnStateChange(u.sc, u.state)
  211. }
  212. case *xdsConfig:
  213. if x.config == nil {
  214. // The first time we get config, we just need to start the xdsClient.
  215. x.startNewXDSClient(u)
  216. x.config = u
  217. return
  218. }
  219. // With a different BalancerName, we need to create a new xdsClient.
  220. // If current or previous ChildPolicy is nil, then we also need to recreate a new xdsClient.
  221. // This is because with nil ChildPolicy xdsClient will do CDS request, while non-nil won't.
  222. if u.BalancerName != x.config.BalancerName || (u.ChildPolicy == nil) != (x.config.ChildPolicy == nil) {
  223. x.startNewXDSClient(u)
  224. }
  225. // We will update the xdsLB with the new child policy, if we got a different one and it's not nil.
  226. // The nil case will be handled when the CDS response gets processed, we will update xdsLB at that time.
  227. if !reflect.DeepEqual(u.ChildPolicy, x.config.ChildPolicy) && u.ChildPolicy != nil && x.xdsLB != nil {
  228. x.xdsLB.HandleChildPolicy(u.ChildPolicy.Name, u.ChildPolicy.Config)
  229. }
  230. if !reflect.DeepEqual(u.FallBackPolicy, x.config.FallBackPolicy) && x.fallbackLB != nil {
  231. x.fallbackLB.Close()
  232. x.startFallBackBalancer(u)
  233. }
  234. x.config = u
  235. default:
  236. // unreachable path
  237. panic("wrong update type")
  238. }
  239. }
  240. func (x *xdsBalancer) handleXDSClientUpdate(update interface{}) {
  241. switch u := update.(type) {
  242. case *cdsResp:
  243. select {
  244. case <-u.ctx.Done():
  245. return
  246. default:
  247. }
  248. x.cancelFallbackAndSwitchEDSBalancerIfNecessary()
  249. // TODO: Get the optional xds record stale timeout from OutlierDetection message. If not exist,
  250. // reset to 0.
  251. // x.xdsStaleTimeout = u.OutlierDetection.TO_BE_DEFINED_AND_ADDED
  252. x.xdsLB.HandleChildPolicy(u.resp.LbPolicy.String(), nil)
  253. case *edsResp:
  254. select {
  255. case <-u.ctx.Done():
  256. return
  257. default:
  258. }
  259. x.cancelFallbackAndSwitchEDSBalancerIfNecessary()
  260. x.xdsLB.HandleEDSResponse(u.resp)
  261. case *loseContact:
  262. select {
  263. case <-u.ctx.Done():
  264. return
  265. default:
  266. }
  267. // if we are already doing fallback monitoring, then we ignore new loseContact signal.
  268. if x.inFallbackMonitor {
  269. return
  270. }
  271. x.inFallbackMonitor = true
  272. x.startFallbackMonitoring()
  273. default:
  274. panic("unexpected xds client update type")
  275. }
  276. }
// connStateMgr records the latest connectivity state passed to updateState and lets a caller wait,
// through notifyWhenNotReady, for that state to be something other than READY.
type connStateMgr struct {
	mu       sync.Mutex         // guards the fields below
	curState connectivity.State // last state passed to updateState
	notify   chan struct{}      // pending notification channel; closed and reset to nil by updateState on a non-READY state
}
  282. func (c *connStateMgr) updateState(s connectivity.State) {
  283. c.mu.Lock()
  284. defer c.mu.Unlock()
  285. c.curState = s
  286. if s != connectivity.Ready && c.notify != nil {
  287. close(c.notify)
  288. c.notify = nil
  289. }
  290. }
  291. func (c *connStateMgr) notifyWhenNotReady() <-chan struct{} {
  292. c.mu.Lock()
  293. defer c.mu.Unlock()
  294. if c.curState != connectivity.Ready {
  295. ch := make(chan struct{})
  296. close(ch)
  297. return ch
  298. }
  299. c.notify = make(chan struct{})
  300. return c.notify
  301. }
// xdsClientConn wraps around the balancer.ClientConn passed in from grpc. The wrapping is to add
// functionality to get notification when no subconn is in READY state.
// TODO: once we have the change that keeps both edsbalancer and fallback balancer alive at the same
// time, we need to make sure to synchronize updates from both entities on the ClientConn.
type xdsClientConn struct {
	// updateState is called with every state passed to UpdateBalancerState, before the update is
	// forwarded to the embedded ClientConn.
	updateState func(s connectivity.State)
	balancer.ClientConn
}
// UpdateBalancerState reports the new state through w.updateState and then forwards the state and
// picker to the wrapped ClientConn.
func (w *xdsClientConn) UpdateBalancerState(s connectivity.State, p balancer.Picker) {
	w.updateState(s)
	w.ClientConn.UpdateBalancerState(s, p)
}
// addressUpdate carries the arguments of one HandleResolvedAddrs call to the run goroutine.
type addressUpdate struct {
	addrs []resolver.Address
	err   error
}
// subConnStateUpdate carries the arguments of one HandleSubConnStateChange call to the run goroutine.
type subConnStateUpdate struct {
	sc    balancer.SubConn
	state connectivity.State
}
  322. func (x *xdsBalancer) HandleSubConnStateChange(sc balancer.SubConn, state connectivity.State) {
  323. update := &subConnStateUpdate{
  324. sc: sc,
  325. state: state,
  326. }
  327. select {
  328. case x.grpcUpdate <- update:
  329. case <-x.ctx.Done():
  330. }
  331. }
  332. func (x *xdsBalancer) HandleResolvedAddrs(addrs []resolver.Address, err error) {
  333. update := &addressUpdate{
  334. addrs: addrs,
  335. err: err,
  336. }
  337. select {
  338. case x.grpcUpdate <- update:
  339. case <-x.ctx.Done():
  340. }
  341. }
  342. // TODO: once the API is merged, check whether we need to change the function name/signature here.
  343. func (x *xdsBalancer) HandleBalancerConfig(config json.RawMessage) error {
  344. var cfg xdsConfig
  345. if err := json.Unmarshal(config, &cfg); err != nil {
  346. return errors.New("unable to unmarshal balancer config into xds config")
  347. }
  348. select {
  349. case x.grpcUpdate <- &cfg:
  350. case <-x.ctx.Done():
  351. }
  352. return nil
  353. }
// cdsResp is the update newADSResponse sends to the run goroutine for a CDS (Cluster) response.
type cdsResp struct {
	ctx  context.Context // context passed to newADSResponse; the update is dropped once it is done
	resp *xdspb.Cluster
}
// edsResp is the update newADSResponse sends to the run goroutine for an EDS
// (ClusterLoadAssignment) response.
type edsResp struct {
	ctx  context.Context // context passed to newADSResponse; the update is dropped once it is done
	resp *xdspb.ClusterLoadAssignment
}
  362. func (x *xdsBalancer) newADSResponse(ctx context.Context, resp proto.Message) error {
  363. var update interface{}
  364. switch u := resp.(type) {
  365. case *xdspb.Cluster:
  366. if u.GetName() != x.cc.Target() {
  367. return fmt.Errorf("unmatched service name, got %s, want %s", u.GetName(), x.cc.Target())
  368. }
  369. if u.GetType() != xdspb.Cluster_EDS {
  370. return fmt.Errorf("unexpected service discovery type, got %v, want %v", u.GetType(), xdspb.Cluster_EDS)
  371. }
  372. update = &cdsResp{ctx: ctx, resp: u}
  373. case *xdspb.ClusterLoadAssignment:
  374. // nothing to check
  375. update = &edsResp{ctx: ctx, resp: u}
  376. default:
  377. grpclog.Warningf("xdsBalancer: got a response that's neither CDS nor EDS, type = %T", u)
  378. }
  379. select {
  380. case x.xdsClientUpdate <- update:
  381. case <-x.ctx.Done():
  382. case <-ctx.Done():
  383. }
  384. return nil
  385. }
// loseContact is the update sent to the run goroutine when an xds client signals that it lost
// contact.
type loseContact struct {
	ctx context.Context // context passed along with the signal; the update is dropped once it is done
}
  389. func (x *xdsBalancer) loseContact(ctx context.Context) {
  390. select {
  391. case x.xdsClientUpdate <- &loseContact{ctx: ctx}:
  392. case <-x.ctx.Done():
  393. case <-ctx.Done():
  394. }
  395. }
  396. func (x *xdsBalancer) switchFallback() {
  397. if x.xdsLB != nil {
  398. x.xdsLB.Close()
  399. x.xdsLB = nil
  400. }
  401. x.startFallBackBalancer(x.config)
  402. x.cancelFallbackMonitoring()
  403. }
  404. // x.cancelFallbackAndSwitchEDSBalancerIfNecessary() will be no-op if we have a working xds client.
  405. // It will cancel fallback monitoring if we are in fallback monitoring stage.
  406. // If there's no running edsBalancer currently, it will create one and initialize it. Also, it will
  407. // shutdown the fallback balancer if there's one running.
  408. func (x *xdsBalancer) cancelFallbackAndSwitchEDSBalancerIfNecessary() {
  409. // xDS update will cancel fallback monitoring if we are in fallback monitoring stage.
  410. x.cancelFallbackMonitoring()
  411. // xDS update will switch balancer back to edsBalancer if we are in fallback.
  412. if x.xdsLB == nil {
  413. if x.fallbackLB != nil {
  414. x.fallbackLB.Close()
  415. x.fallbackLB = nil
  416. }
  417. x.xdsLB = newEDSBalancer(x.cc)
  418. if x.config.ChildPolicy != nil {
  419. x.xdsLB.HandleChildPolicy(x.config.ChildPolicy.Name, x.config.ChildPolicy.Config)
  420. }
  421. }
  422. }
  423. func (x *xdsBalancer) startFallBackBalancer(c *xdsConfig) {
  424. if c.FallBackPolicy == nil {
  425. x.startFallBackBalancer(&xdsConfig{
  426. FallBackPolicy: &loadBalancingConfig{
  427. Name: "round_robin",
  428. },
  429. })
  430. return
  431. }
  432. // builder will always be non-nil, since when parse JSON into xdsConfig, we check whether the specified
  433. // balancer is registered or not.
  434. builder := balancer.Get(c.FallBackPolicy.Name)
  435. x.fallbackLB = builder.Build(x.cc, x.buildOpts)
  436. if x.fallbackInitData != nil {
  437. // TODO: uncomment when HandleBalancerConfig API is merged.
  438. //x.fallbackLB.HandleBalancerConfig(c.FallBackPolicy.Config)
  439. x.fallbackLB.HandleResolvedAddrs(x.fallbackInitData.addrs, x.fallbackInitData.err)
  440. }
  441. }
  442. // There are three ways that could lead to fallback:
  443. // 1. During startup (i.e. the first xds client is just created and attempts to contact the traffic
  444. // director), fallback if it has not received any response from the director within the configured
  445. // timeout.
  446. // 2. After xds client loses contact with the remote, fallback if all connections to the backends are
  447. // lost (i.e. not in state READY).
  448. // 3. After xds client loses contact with the remote, fallback if the stale eds timeout has been
  449. // configured through CDS and is timed out.
  450. func (x *xdsBalancer) startFallbackMonitoring() {
  451. if x.startup {
  452. x.startup = false
  453. x.timer.Reset(x.startupTimeout)
  454. return
  455. }
  456. x.noSubConnAlert = x.connStateMgr.notifyWhenNotReady()
  457. if x.xdsStaleTimeout != nil {
  458. if !x.timer.Stop() {
  459. <-x.timer.C
  460. }
  461. x.timer.Reset(*x.xdsStaleTimeout)
  462. }
  463. }
  464. // There are two cases where fallback monitoring should be canceled:
  465. // 1. xDS client returns a new ADS message.
  466. // 2. fallback has been triggered.
  467. func (x *xdsBalancer) cancelFallbackMonitoring() {
  468. if !x.timer.Stop() {
  469. select {
  470. case <-x.timer.C:
  471. // For cases where some fallback condition happens along with the timeout, but timeout loses
  472. // the race, so we need to drain the x.timer.C. thus we don't trigger fallback again.
  473. default:
  474. // if the timer timeout leads us here, then there's no thing to drain from x.timer.C.
  475. }
  476. }
  477. x.noSubConnAlert = nil
  478. x.inFallbackMonitor = false
  479. }
// Close cancels the balancer's context. The run goroutine reacts to that by closing the xds
// client and the balancers and then exiting.
func (x *xdsBalancer) Close() {
	x.cancel()
}
  483. func createDrainedTimer() *time.Timer {
  484. timer := time.NewTimer(0 * time.Millisecond)
  485. // make sure initially the timer channel is blocking until reset.
  486. if !timer.Stop() {
  487. <-timer.C
  488. }
  489. return timer
  490. }
// xdsConfig is the xds balancer configuration parsed from JSON by UnmarshalJSON.
type xdsConfig struct {
	BalancerName   string               // from the "balancerName" key; passed to newXDSClient
	ChildPolicy    *loadBalancingConfig // first registered policy from the "childPolicy" list; nil if none
	FallBackPolicy *loadBalancingConfig // first registered policy from the "fallbackPolicy" list; nil if none
}
  496. // When unmarshalling json to xdsConfig, we iterate through the childPolicy/fallbackPolicy lists
  497. // and select the first LB policy which has been registered to be stored in the returned xdsConfig.
  498. func (p *xdsConfig) UnmarshalJSON(data []byte) error {
  499. var val map[string]json.RawMessage
  500. if err := json.Unmarshal(data, &val); err != nil {
  501. return err
  502. }
  503. for k, v := range val {
  504. switch k {
  505. case "balancerName":
  506. if err := json.Unmarshal(v, &p.BalancerName); err != nil {
  507. return err
  508. }
  509. case "childPolicy":
  510. var lbcfgs []*loadBalancingConfig
  511. if err := json.Unmarshal(v, &lbcfgs); err != nil {
  512. return err
  513. }
  514. for _, lbcfg := range lbcfgs {
  515. if balancer.Get(lbcfg.Name) != nil {
  516. p.ChildPolicy = lbcfg
  517. break
  518. }
  519. }
  520. case "fallbackPolicy":
  521. var lbcfgs []*loadBalancingConfig
  522. if err := json.Unmarshal(v, &lbcfgs); err != nil {
  523. return err
  524. }
  525. for _, lbcfg := range lbcfgs {
  526. if balancer.Get(lbcfg.Name) != nil {
  527. p.FallBackPolicy = lbcfg
  528. break
  529. }
  530. }
  531. }
  532. }
  533. return nil
  534. }
// MarshalJSON is a stub: it always returns nil bytes and a nil error.
// NOTE(review): looks like a placeholder, since nil bytes are not a valid JSON encoding — confirm
// whether real marshalling is needed.
func (p *xdsConfig) MarshalJSON() ([]byte, error) {
	return nil, nil
}
// loadBalancingConfig is one load balancing policy entry: the policy name and its raw JSON config.
// UnmarshalJSON fills it from a JSON object, taking the key as Name and the value as Config.
type loadBalancingConfig struct {
	Name   string
	Config json.RawMessage
}
// MarshalJSON is a stub: it always returns nil bytes and a nil error.
// NOTE(review): looks like a placeholder, since nil bytes are not a valid JSON encoding — confirm
// whether real marshalling is needed.
func (l *loadBalancingConfig) MarshalJSON() ([]byte, error) {
	return nil, nil
}
  545. func (l *loadBalancingConfig) UnmarshalJSON(data []byte) error {
  546. var cfg map[string]json.RawMessage
  547. if err := json.Unmarshal(data, &cfg); err != nil {
  548. return err
  549. }
  550. for name, config := range cfg {
  551. l.Name = name
  552. l.Config = config
  553. }
  554. return nil
  555. }