"""Generate step-by-step shell instructions for recovering an etcd cluster.

Nothing is executed by this script: every command is only printed, so that an
operator can review the procedure and run it by hand.

Usage: etcd_recovery CLUSTER_NAME MASTER

CLUSTER_NAME is a key of ``etcd_clusters`` and MASTER is the short name of the
member whose backup is used to re-seed the cluster.

The code runs unchanged on Python 2.7 and Python 3: every ``print`` call takes
exactly one string argument, which behaves identically under both versions.
"""
import json
import logging
import sys

try:
    from urlparse import urlparse  # Python 2
except ImportError:
    from urllib.parse import urlparse  # Python 3

log = logging.getLogger(__name__)

# Globals

# Peer URL of every member, keyed by cluster name and then by short host name.
# The host part of each URL is also what Generator.fqdn() uses as ssh target.
etcd_clusters = {
    'eqiad.wmnet': {
        'conf1001': 'http://conf1001.eqiad.wmnet:2380',
        'conf1002': 'http://conf1002.eqiad.wmnet:2380',
        'conf1003': 'http://conf1003.eqiad.wmnet:2380',
    },
    # NOTE(review): these URLs used to end in ".eqiad.wmnet", which looks like
    # a copy/paste slip from the block above; confirm the codfw host names.
    'codfw.wmnet': {
        'conf2001': 'http://conf2001.codfw.wmnet:2380',
        'conf2002': 'http://conf2002.codfw.wmnet:2380',
        'conf2003': 'http://conf2003.codfw.wmnet:2380',
    },
}

# Scratch directory the backup is copied to and the temporary etcd runs from.
recovery = "/tmp/etcd-recovery"


class Generator(object):
    """Print the shell commands that rebuild one etcd cluster from a backup.

    Attributes:
        cluster_name: key of ``etcd_clusters`` selecting the cluster.
        master: short host name of the member restored from backup.
        cluster: mapping of short host name to peer URL for this cluster.
        current_cluster: comma-separated ``name=peer_url`` pairs of the
            members registered so far; passed to ``--initial-cluster``.
        etcd_dir: final etcd data directory on every member.
    """

    def __init__(self, cluster_name, master):
        """Set up the generator.

        Raises:
            KeyError: if ``cluster_name`` is not defined in ``etcd_clusters``.
        """
        self.cluster_name = cluster_name
        self.master = master
        self.cluster = etcd_clusters[cluster_name]
        self.current_cluster = ""
        self.etcd_dir = '/var/lib/etcd/etcd-' + self.cluster_name

    def fqdn(self, host):
        """Return the host name found in the peer URL of ``host``."""
        return urlparse(self.cluster[host]).hostname

    def ssh(self, host, cmd):
        """Return the ssh command line that runs ``cmd`` on ``host``."""
        return "ssh {} {}".format(self.fqdn(host), cmd)

    def adv_client_url(self, host):
        """Return the client URL that ``host`` advertises to the cluster."""
        return "https://{}:2379".format(self.fqdn(host))

    def listen_client_url(self):
        """Return the loopback client URL etcd is told to listen on."""
        return "http://127.0.0.1:2378"

    def stop_etcd_service(self):
        """Print the commands stopping etcd.service on every member."""
        # Sorted so that the generated instructions are reproducible.
        for host in sorted(self.cluster):
            log.info('Stopping etcd on %s', host)
            print(self.ssh(host, 'sudo systemctl stop etcd.service'))

    def launch_temp_etcd(self):
        """Print the commands starting a temporary etcd from the backup.

        The latest backup is copied to the recovery directory on the master,
        handed over to the etcd user, and etcd is started on it with
        ``--force-new-cluster``.
        """
        print(self.ssh(
            self.master,
            'sudo cp -ax /srv/backups/etcd/etcd-{0}-backup {1}'.format(
                self.cluster_name, recovery)
        ))
        print(self.ssh(self.master, 'sudo chown -R etcd:etcd ' + recovery))
        data = {
            'name': self.master,
            'data_dir': recovery,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(self.master),
            'peer': self.cluster[self.master],
            'args': "--force-new-cluster",
        }
        print(self.ssh(self.master, self._etcd_cmd(data)))

    def _etcd_cmd(self, data):
        """Return the etcd command line described by ``data``.

        ``data`` must provide the keys ``data_dir``, ``name``, ``peer``,
        ``listen``, ``adv`` and ``args``.
        """
        template = (
            "sudo -u etcd etcd --data-dir {data_dir} --name {name} "
            "-initial-advertise-peer-urls {peer} "
            "-listen-peer-urls {peer} "
            "-listen-client-urls {listen} "
            "-advertise-client-urls {adv} "
            "{args}"
        )
        return template.format(**data)

    def _curl(self, url, method='GET', req=None):
        """Return a curl command for the v2 API path ``url``.

        When ``req`` is given it is serialised as the JSON request body.
        """
        client_url = self.listen_client_url() + '/v2' + url
        cmd = "curl {} -L -X {}".format(client_url, method)
        if req is not None:
            cmd += ' -H "Content-Type: application/json" -d \'{}\''.format(
                json.dumps(req))
        return cmd

    def _etcdctl(self, cmd, username=None):
        """Return an etcdctl command line for the local client endpoint."""
        parts = ['etcdctl']
        if username:
            parts.extend(['--username', username])
        parts.extend(['--endpoint', self.listen_client_url(), cmd])
        return ' '.join(parts)

    def _register_member(self, host):
        """Append ``host`` to the ``--initial-cluster`` value built so far."""
        entry = '{}={}'.format(host, self.cluster[host])
        if self.current_cluster:
            self.current_cluster += ',' + entry
        else:
            self.current_cluster = entry

    def change_etcd_peer_url(self):
        """Print the commands resetting the master's peer URL.

        The first command stores the member id of the master in the shell
        variable ``member``; the second one PUTs the real peer URL for it.
        The master is then recorded as the first member of the new cluster.
        """
        req = {'peerURLs': [self.cluster[self.master]]}
        print("member=$({} | grep {} | cut -d\\: -f1)".format(
            self._etcdctl("member list"), self.master))
        print(self._curl('/members/$member', method='PUT', req=req))
        self._register_member(self.master)

    def move_temp_dir(self):
        """Print the commands moving the recovered data into place."""
        # kill any etcd running on the server
        print(self.ssh(self.master, 'sudo killall -15 etcd'))
        self.wipe_etcd_dir(self.master)
        print(self.ssh(
            self.master, 'sudo mv {} {}'.format(recovery, self.etcd_dir)))

    def wipe_etcd_dir(self, host):
        """Print the command removing the etcd data directory on ``host``."""
        print(self.ssh(host, 'sudo rm -rf {}'.format(self.etcd_dir)))

    def add_to_cluster(self, host):
        """Print the command adding ``host`` as a member, and record it."""
        print(self.ssh(self.master, self._etcdctl(
            "member add {} {}".format(host, self.cluster[host]))))
        self._register_member(host)

    def start_etcd(self, host):
        """Print the command starting etcd on ``host`` in the known cluster."""
        data = {
            'name': host,
            'data_dir': self.etcd_dir,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(host),
            'peer': self.cluster[host],
            'args': "--initial-cluster-state existing "
                    "--initial-cluster {}".format(self.current_cluster),
        }
        print(self.ssh(host, self._etcd_cmd(data)))

    def enable_auth(self):
        """Print the command enabling authentication, run on the master."""
        print(self.ssh(self.master, self._etcdctl("auth enable")))


def _print_step(title):
    """Print a step heading surrounded by blank lines."""
    print("\n### {}\n".format(title))


def main(argv=None):
    """Print the recovery instructions for the requested cluster.

    Args:
        argv: ``[cluster_name, master]``; defaults to ``sys.argv[1:]``.
            Additional arguments are ignored.

    Returns:
        0 on success, 2 when the arguments are missing or unknown (a usage
        message is written to stderr in that case).
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) < 2:
        sys.stderr.write(
            "Usage: {} CLUSTER_NAME MASTER\n".format(sys.argv[0]))
        return 2
    cluster_name, master = argv[0], argv[1]
    if cluster_name not in etcd_clusters:
        sys.stderr.write("Unknown cluster '{}', known clusters: {}\n".format(
            cluster_name, ', '.join(sorted(etcd_clusters))))
        return 2
    if master not in etcd_clusters[cluster_name]:
        sys.stderr.write("Unknown host '{}', members of {}: {}\n".format(
            master, cluster_name,
            ', '.join(sorted(etcd_clusters[cluster_name]))))
        return 2

    gen = Generator(cluster_name, master)
    nodes = ', '.join(
        '{}={}'.format(name, url) for name, url in sorted(gen.cluster.items()))
    print("\n".join([
        "#############################",
        "# ETCD Recovery instructions (generated via etcd_recovery)",
        "# Cluster : {}".format(cluster_name),
        "# Master  : {}".format(master),
        "# Nodes: {}".format(nodes),
        "#############################",
        "",
    ]))

    _print_step("STEP 1: stop etcd across the cluster.")
    gen.stop_etcd_service()

    _print_step("STEP 2: set up the new master from its backup")
    print("# Now launch the temporary etcd master from backup")
    gen.launch_temp_etcd()
    print("")
    print("# When it works, SSH TO THE MASTER AND launch the following")
    gen.change_etcd_peer_url()
    print("")
    print("# Now kill the original etcd running in the original shell, "
          "and start it from the right position")
    gen.move_temp_dir()
    gen.start_etcd(gen.master)

    _print_step("STEP 3: Add back and start the other nodes")
    for host in sorted(gen.cluster):
        if host == master:
            continue
        gen.add_to_cluster(host)
        gen.wipe_etcd_dir(host)
        gen.start_etcd(host)
    # Enabling auth was already disabled in the original script; kept for
    # reference only.
    # gen.enable_auth()
    return 0


if __name__ == '__main__':
    sys.exit(main())