root@puppetmaster1001:~$ puppet-merge Fetching new commits from: https://gerrit.wikimedia.org/r/labs/private No changes to merge. Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet diff --git a/hieradata/hosts/db2141.yaml b/hieradata/hosts/db2141.yaml new file mode 100644 index 0000000000..f3eed881d9 --- /dev/null +++ b/hieradata/hosts/db2141.yaml @@ -0,0 +1,6 @@ +# db2141 +# Buffer pool sizes/instance enabled +profile::mariadb::dbstore_multiinstance::num_instances: 2 +profile::mariadb::dbstore_multiinstance::s1: '192G' +profile::mariadb::dbstore_multiinstance::s6: '192G' +profile::base::notifications: disabled diff --git a/manifests/site.pp b/manifests/site.pp index 2112a04c4d..71c593b362 100644 --- a/manifests/site.pp +++ b/manifests/site.pp @@ -472,11 +472,6 @@ node 'db2140.codfw.wmnet' { role(mariadb::core) } -# codfw MySQL source backups expansion T260819 -node 'db2141.codfw.wmnet' { - role(insetup) -} - # codfw replicas # See also db2137 and db2138 below node /^db2(073|106|110|119|136)\.codfw\.wmnet/ { @@ -726,25 +721,35 @@ node 'db1145.eqiad.wmnet' { # codfw backup sources - +## s1 & s6, stretch node 'db2097.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## s2 & s3, stretch node 'db2098.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## s4 & s5, stretch node 'db2099.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## s7 & s8, stretch node 'db2100.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## x1, buster node 'db2101.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## s4 & s5, buster node 'db2139.codfw.wmnet' { role(mariadb::dbstore_multiinstance) } +## s1 & s6, buster +node 'db2141.codfw.wmnet' { + role(mariadb::dbstore_multiinstance) +} + # backup testing hosts node 'db1133.eqiad.wmnet' { diff --git a/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py b/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py index f4ed730bd4..8a2912b564 100644 --- a/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py +++ b/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py @@ -15,7 +15,7 @@ log = logging.getLogger(__name__) def collect_stats_from_romc_smi(registry, rocm_smi_path): out = subprocess.run([ rocm_smi_path, "--showuse", "--showpower", - "--showtemp", "--showfan", "--json" + "--showtemp", "--showfan", "--showmeminfo", "all", "--json" ], capture_output=True, text=True) rocm_metrics = {} for line in out.stdout.splitlines(): @@ -47,6 +47,14 @@ def collect_stats_from_romc_smi(registry, rocm_smi_path): namespace='amd_rocm_gpu', registry=registry ) + gpu_stats['memory_total'] = Gauge( + 'memory_total_bytes', 'Total GPU memory (bytes)', ['card', 'memtype'], + namespace='amd_rocm_gpu', registry=registry + ) + gpu_stats['memory_used'] = Gauge( + 'memory_used_bytes', 'Used GPU memory (bytes)', ['card', 'memtype'], + namespace='amd_rocm_gpu', registry=registry + ) for card in rocm_metrics: for metric in rocm_metrics[card]: # General usage @@ -86,6 +94,30 @@ def collect_stats_from_romc_smi(registry, rocm_smi_path): elif metric == 'Fan Speed (level)': # we care only about the percentage value continue + + # Memory + # Total memory amounts, for percentage calculation with used memory + elif metric == 'vram Total Memory (B)': + gpu_stats['memory_total'].labels(card=card, memtype='vram').set( + rocm_metrics[card][metric].strip()) + elif metric == 'gtt Total Memory (B)': + gpu_stats['memory_total'].labels(card=card, memtype='gtt').set( + rocm_metrics[card][metric].strip()) + elif metric == 'vis_vram Total Memory (B)': + gpu_stats['memory_total'].labels(card=card, memtype='vis').set( + rocm_metrics[card][metric].strip()) + # Used memory amounts + elif metric == 'vram Total Used Memory (B)': + gpu_stats['memory_used'].labels(card=card, memtype='vram').set( + rocm_metrics[card][metric].strip()) + elif metric == 'gtt Total Used Memory (B)': + gpu_stats['memory_used'].labels(card=card, memtype='gtt').set( + rocm_metrics[card][metric].strip()) + elif metric == 'vis_vram Total Used Memory (B)': + gpu_stats['memory_used'].labels(card=card, memtype='vis').set( + rocm_metrics[card][metric].strip()) + + # Unknown stuff should emit a warning (to be delivered by cron mail) else: log.warning( "Metric {} listed in rocm-smi's JSON but not parsed" Jcrespo: mariadb-backups: Add db2141 to the dbstore role for backup source (3330487795) Tobias Klausmann: prometheus: Add more stats to AMD ROCm GPU exporter (3c649a46c5) WARNING: Revision range includes commits from multiple committers! Merge these changes? (multiple/no)? multiple HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 No LABS changes to merge ===> Starting run on puppetmaster1002.eqiad.wmnet... Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 Connection to puppetmaster1002.eqiad.wmnet closed. OK: puppet-merge on puppetmaster1002.eqiad.wmnet (ops) succeeded ===> Starting run on puppetmaster1003.eqiad.wmnet... Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 Connection to puppetmaster1003.eqiad.wmnet closed. OK: puppet-merge on puppetmaster1003.eqiad.wmnet (ops) succeeded ===> Starting run on puppetmaster2001.codfw.wmnet... Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 Connection to puppetmaster2001.codfw.wmnet closed. OK: puppet-merge on puppetmaster2001.codfw.wmnet (ops) succeeded ===> Starting run on puppetmaster2002.codfw.wmnet... Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 Connection to puppetmaster2002.codfw.wmnet closed. OK: puppet-merge on puppetmaster2002.codfw.wmnet (ops) succeeded ===> Starting run on puppetmaster2003.codfw.wmnet... Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1 Updating f9fb51703b..3330487795 Fast-forward hieradata/hosts/db2141.yaml | 6 ++++++ manifests/site.pp | 17 +++++++++++------ modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 hieradata/hosts/db2141.yaml Running git clean to clean any untracked files. All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097 Connection to puppetmaster2003.codfw.wmnet closed. OK: puppet-merge on puppetmaster2003.codfw.wmnet (ops) succeeded Now running conftool-merge to sync any changes to conftool data Running conftool-sync on /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity node from /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/codfw.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/eqsin.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/eqiad.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/esams.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/ulsfo.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity discovery from /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/discovery/services.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/discovery/mediawiki.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity mwconfig from /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/mwconfig/data.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity dbconfig-instance from /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/dbconfig-instance/instances.yaml 2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity dbconfig-section from /etc/conftool/data 2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/dbconfig-section/sections.yaml 2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for node 2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for discovery 2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for mwconfig 2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for dbconfig-instance 2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for dbconfig-section 2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for dbconfig-section 2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for dbconfig-instance 2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for mwconfig 2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for discovery 2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for node Done.