# Copyright 2009-2010 Yelp
# Copyright 2015-2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Audit EMR usage over the past two months (as far back as the EMR API
reports clusters), sorted by cluster name and user.

Usage::

    mrjob audit-emr-usage > report

Options::

  -c CONF_PATHS, --conf-path CONF_PATHS
                        Path to alternate mrjob.conf file to read from
  --no-conf             Don't load mrjob.conf even if it's available
  --ec2-endpoint EC2_ENDPOINT
                        Force mrjob to connect to EC2 on this endpoint (e.g.
                        ec2.us-west-1.amazonaws.com). Default is to infer
                        this from region.
  --emr-endpoint EMR_ENDPOINT
                        Force mrjob to connect to EMR on this endpoint (e.g.
                        us-west-1.elasticmapreduce.amazonaws.com). Default is
                        to infer this from region.
  -h, --help            show this help message and exit
  --max-days-ago MAX_DAYS_AGO
                        Max number of days ago to look at jobs. By default,
                        we go back as far as EMR supports (currently about 2
                        months)
  -q, --quiet           Don't print anything to stderr
  --region REGION       GCE/AWS region to run Dataproc/EMR jobs in.
  --s3-endpoint S3_ENDPOINT
                        Force mrjob to connect to S3 on this endpoint (e.g.
                        s3-us-west-1.amazonaws.com). You usually shouldn't
                        set this; by default mrjob will choose the correct
                        endpoint for each S3 bucket based on its location.
  -v, --verbose         print more messages to stderr
"""
# This just approximates EMR billing rules. For the actual rules, see:
#
# http://aws.amazon.com/elasticmapreduce/faqs/
from __future__ import print_function

import math
import logging
import re
from argparse import ArgumentParser
from datetime import datetime
from datetime import timedelta
from time import sleep

from mrjob.aws import _boto3_now
from mrjob.aws import _boto3_paginate
from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
from mrjob.options import _filter_by_role
from mrjob.pool import _pool_name
from mrjob.util import strip_microseconds


# match an mrjob job key (used to uniquely identify the job)
_JOB_KEY_RE = re.compile(r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+)$')

# match an mrjob step name (these are used to name steps in EMR)
_STEP_NAME_RE = re.compile(
    r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+): Step (\d+) of (\d+)$')

# wait one second between successive calls to EMR API
_DELAY = 1

log = logging.getLogger(__name__)


def main(args=None):
    # parse command-line args
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    now = _boto3_now()

    log.info('getting cluster history...')
    clusters = list(_yield_clusters(
        max_days_ago=options.max_days_ago, now=now,
        **_runner_kwargs(options)))

    log.info('compiling cluster stats...')
    stats = _clusters_to_stats(clusters, now=now)

    _print_report(stats, now=now)


def _make_arg_parser():
    usage = '%(prog)s audit-emr-usage [options]'
    description = 'Print a giant report on EMR usage.'
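
    # --max-days-ago is the only option specific to this tool; the rest are
    # mrjob's shared basic args plus EMRJobRunner's 'connect' options,
    # added by the helpers called below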
    arg_parser = ArgumentParser(usage=usage, description=description)

    arg_parser.add_argument(
        '--max-days-ago', dest='max_days_ago', type=float, default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser, _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    _alphabetize_actions(arg_parser)

    return arg_parser


def _runner_kwargs(options):
    kwargs = options.__dict__.copy()
    for unused_arg in ('quiet', 'verbose', 'max_days_ago'):
        del kwargs[unused_arg]

    return kwargs


def _clusters_to_stats(clusters, now=None):
    r"""Aggregate statistics for several clusters into a dictionary.

    :param clusters: a sequence of :py:mod:`boto3` cluster data structures
                     (with a ``Steps`` key filled in), as yielded by
                     :py:func:`_yield_clusters`.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with many keys, including:

    * *clusters*: a list of dictionaries; the result of running
      :py:func:`_cluster_to_full_summary` on each cluster.

    total usage:

    * *nih_billed*: total normalized instance hours billed, for all clusters
    * *nih_used*: total normalized instance hours actually used for
      bootstrapping and running jobs.
    * *nih_bbnu*: total usage billed but not used (`nih_billed - nih_used`)

    further breakdown of total usage:

    * *bootstrap_nih_used*: total usage for bootstrapping
    * *end_nih_bbnu*: unused time at the end of clusters
    * *job_nih_used*: total usage for jobs (`nih_used - bootstrap_nih_used`)
    * *other_nih_bbnu*: other unused time (`nih_bbnu - end_nih_bbnu`)

    grouping by various keys:

    (There is a *_used*, *_billed*, and *_bbnu* version of all stats below)

    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours during the hour starting at that time
    * *label_to_nih_\**: map from jobs' labels (usually the module name of
      the job) to normalized instance hours, with ``None`` for
      non-:py:mod:`mrjob` jobs. This includes usage data for bootstrapping.
    * *job_step_to_nih_\**: map from jobs' labels and step number to
      normalized instance hours, using ``(None, None)`` for
      non-:py:mod:`mrjob` jobs. This does not include bootstrapping.
    * *job_step_to_nih_\*_no_pool*: same as *job_step_to_nih_\**, but only
      including non-pooled clusters.
    * *owner_to_nih_\**: map from jobs' owners (usually the user who ran
      them) to normalized instance hours, with ``None`` for
      non-:py:mod:`mrjob` jobs. This includes usage data for bootstrapping.
    * *pool_to_nih_\**: map from pool name to normalized instance hours,
      with ``None`` for non-pooled jobs and non-:py:mod:`mrjob` jobs.
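
    A rough sketch of the returned shape (illustrative values only)::

        {
            'clusters': [...],   # one full summary per cluster
            'nih_billed': 12.0,
            'nih_used': 9.5,
            'nih_bbnu': 2.5,
            'date_to_nih_billed': {date(2020, 4, 1): 12.0},
            'hour_to_nih_used': {datetime(2020, 4, 1, 6, 0): 9.5},
            'label_to_nih_used': {'mr_wc': 9.5},
            'owner_to_nih_used': {'dave': 9.5},
            'job_step_to_nih_used': {('mr_wc', 1): 8.0},
            'pool_to_nih_billed': {None: 12.0},
            ...
        }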
""" s = {} # stats for all clusters s['clusters'] = [_cluster_to_full_summary(cluster, now=now) for cluster in clusters] # from here on out, we only process s['clusters'] # total usage for nih_type in ('nih_billed', 'nih_used', 'nih_bbnu'): s[nih_type] = float(sum( cs[nih_type] for cs in s['clusters'])) # break down by usage/waste s['bootstrap_nih_used'] = float(sum( cs['usage'][0]['nih_used'] for cs in s['clusters'] if cs['usage'])) s['job_nih_used'] = s['nih_used'] - s['bootstrap_nih_used'] s['end_nih_bbnu'] = float(sum( cs['usage'][-1]['nih_bbnu'] for cs in s['clusters'] if cs['usage'])) s['other_nih_bbnu'] = s['nih_bbnu'] - s['end_nih_bbnu'] # stats by date/hour for interval_type in ('date', 'hour'): for nih_type in ('nih_billed', 'nih_used', 'nih_bbnu'): key = '%s_to_%s' % (interval_type, nih_type) start_to_nih = {} for cs in s['clusters']: for u in cs['usage']: for start, nih in u[key].items(): start_to_nih.setdefault(start, 0.0) start_to_nih[start] += nih s[key] = start_to_nih # break out by label (usually script name) and owner (usually current user) for key in ('label', 'owner'): for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'): key_to_nih = {} for cs in s['clusters']: for u in cs['usage']: key_to_nih.setdefault(u[key], 0.0) key_to_nih[u[key]] += u[nih_type] s['%s_to_%s' % (key, nih_type)] = key_to_nih # break down by job step. separate out un-pooled jobs for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'): job_step_to_nih = {} job_step_to_nih_no_pool = {} for cs in s['clusters']: for u in cs['usage'][1:]: job_step = (u['label'], u['step_num']) job_step_to_nih.setdefault(job_step, 0.0) job_step_to_nih[job_step] += u[nih_type] if not cs['pool']: job_step_to_nih_no_pool.setdefault(job_step, 0.0) job_step_to_nih_no_pool[job_step] += u[nih_type] s['job_step_to_%s' % nih_type] = job_step_to_nih s['job_step_to_%s_no_pool' % nih_type] = job_step_to_nih_no_pool # break down by pool for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'): pool_to_nih = {} for cs in s['clusters']: pool_to_nih.setdefault(cs['pool'], 0.0) pool_to_nih[cs['pool']] += cs[nih_type] s['pool_to_%s' % nih_type] = pool_to_nih return s def _cluster_to_full_summary(cluster, now=None): """Convert a cluster to a full summary for use in creating a report, including billing/usage information. :param cluster: a :py:mod:`boto3` cluster data structure :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. Returns a dictionary with the keys from :py:func:`cluster_to_basic_summary` plus: * *nih_billed*: total normalized instances hours billed for this cluster * *nih_used*: total normalized instance hours actually used for bootstrapping and running jobs. * *nih_bbnu*: total usage billed but not used (`nih_billed - nih_used`) * *usage*: job-specific usage information, returned by :py:func:`_cluster_to_usage_data`. """ cs = _cluster_to_basic_summary(cluster, now=now) cs['usage'] = _cluster_to_usage_data( cluster, basic_summary=cs, now=now) # add up billing info cs['nih_billed'] = float(sum(u['nih_billed'] for u in cs['usage'])) for nih_type in ('nih_used', 'nih_bbnu'): cs[nih_type] = float(sum(u[nih_type] for u in cs['usage'])) return cs def _cluster_to_basic_summary(cluster, now=None): """Extract fields such as creation time, owner, etc. from the cluster. :param cluster: a :py:mod:`boto3` cluster data structure :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. Returns a dictionary with the following keys. 
    These will be ``None`` if the corresponding field in the cluster is
    unavailable.

    * *created*: UTC `datetime.datetime` that the cluster was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the cluster finished, or ``None``
    * *id*: cluster ID, or ``None`` (this should never happen)
    * *label*: The label for the cluster (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` clusters.
    * *name*: cluster name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours the cluster *would* use if
      it ran to the end of the next full hour (this is how EMR reports
      ``NormalizedInstanceHours``).
    * *num_steps*: Number of steps in the cluster.
    * *owner*: The owner for the cluster (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` clusters.
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``.
    * *ran*: How long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if the
      cluster hasn't started.
    * *ready*: UTC `datetime.datetime` that the cluster finished
      bootstrapping, or ``None``
    * *state*: The cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = _boto3_now()

    bcs = {}  # basic cluster summary to fill in

    bcs['id'] = cluster['Id']
    bcs['name'] = cluster['Name']

    Status = cluster['Status']
    Timeline = Status.get('Timeline', {})

    bcs['created'] = Timeline.get('CreationDateTime')
    bcs['ready'] = Timeline.get('ReadyDateTime')
    bcs['end'] = Timeline.get('EndDateTime')

    if bcs['created']:
        bcs['ran'] = (bcs['end'] or now) - bcs['created']
    else:
        bcs['ran'] = timedelta(0)

    bcs['state'] = Status.get('State')

    bcs['num_steps'] = len(cluster['Steps'])

    bcs['pool'] = _pool_name(cluster)

    m = _JOB_KEY_RE.match(bcs['name'] or '')
    if m:
        bcs['label'], bcs['owner'] = m.group(1), m.group(2)
    else:
        bcs['label'], bcs['owner'] = None, None

    bcs['nih'] = float(cluster.get('NormalizedInstanceHours', 0))

    return bcs


def _cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a cluster down by job.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`_cluster_to_basic_summary`. If this is
                          ``None``, we'll call
                          :py:func:`_cluster_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running.
    If the cluster hasn't started yet, returns ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes,
      either when the next job starts, the current time if the job is still
      running, or, for the last interval, the end of the cluster's billing
      period (EMR bills per second, with a one-minute minimum per cluster).
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number of
      normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    """
    bcs = basic_summary or _cluster_to_basic_summary(cluster)

    if now is None:
        now = _boto3_now()

    if not bcs['created']:
        return []

    # EMR no longer bills by the full hour, but NormalizedInstanceHours
    # still works that way
    full_hours = math.ceil(timedelta.total_seconds(bcs['ran']) / 60.0 / 60.0)
    nih_per_sec = bcs['nih'] / (full_hours * 3600.0)

    # EMR bills by the full second, and at least one minute per cluster
    cluster_end_billing = bcs['created'] + max(
        _round_up_to_next_second(bcs['ran']), timedelta(minutes=1))

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    for step in cluster['Steps']:
        Status = step['Status']
        Timeline = Status.get('Timeline', {})

        # we've reached the last step that's actually run
        if not Timeline.get('StartDateTime'):
            break

        step_start = Timeline['StartDateTime']

        step_end = Timeline.get('EndDateTime')
        if step_end is None:
            # step started running and was cancelled.
            # credit it for 0 usage
            if bcs['end']:
                step_end = step_start
            # step is still running
            else:
                step_end = now

        m = _STEP_NAME_RE.match(step['Name'])
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing
    for i in range(len(intervals) - 1):
        intervals[i]['end_billing'] = intervals[i + 1]['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec *
            timedelta.total_seconds(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(interval['start'],
                                           interval['end']).items())

        interval['hour_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(interval['start'],
                                           interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec *
            timedelta.total_seconds(
                interval['end_billing'] - interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(
                interval['start'], interval['end_billing']).items())

        interval['hour_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(
                interval['start'], interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

        interval['hour_to_nih_bbnu'] = {}
        for d, nih_billed in interval['hour_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['hour_to_nih_bbnu'][d] = nih_bbnu

    return intervals


def _subdivide_interval_by_date(start, end):
    """Convert a time interval to a map from :py:class:`datetime.date` to
    the number of seconds within the interval on that date.

    *start* and *end* are :py:class:`datetime.datetime` objects.
    """
    if start.date() == end.date():
        date_to_secs = {start.date(): timedelta.total_seconds(end - start)}
    else:
        date_to_secs = {}

        date_to_secs[start.date()] = timedelta.total_seconds(
            datetime(start.year, start.month, start.day,
                     tzinfo=start.tzinfo) +
            timedelta(days=1) - start)

        date_to_secs[end.date()] = timedelta.total_seconds(
            end - datetime(end.year, end.month, end.day, tzinfo=end.tzinfo))

        # fill in dates in the middle
        cur_date = start.date() + timedelta(days=1)
        while cur_date < end.date():
            date_to_secs[cur_date] = timedelta.total_seconds(
                timedelta(days=1))
            cur_date += timedelta(days=1)

    # remove zeros
    date_to_secs = dict(
        (d, secs) for d, secs in date_to_secs.items() if secs)

    return date_to_secs


def _subdivide_interval_by_hour(start, end):
    """Convert a time interval to a map from hours (represented as
    :py:class:`datetime.datetime` for the start of the hour) to the number
    of seconds during that hour that are within the interval.

    *start* and *end* are :py:class:`datetime.datetime` objects.
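
    For example (purely illustrative), an interval from 10:45 to 12:15 on
    the same day maps the hours starting at 10:00, 11:00, and 12:00 to
    900.0, 3600.0, and 900.0 seconds, respectively.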
""" start_hour = start.replace(minute=0, second=0, microsecond=0) end_hour = end.replace(minute=0, second=0, microsecond=0) if start_hour == end_hour: hour_to_secs = {start_hour: timedelta.total_seconds(end - start)} else: hour_to_secs = {} hour_to_secs[start_hour] = timedelta.total_seconds( start_hour + timedelta(hours=1) - start) hour_to_secs[end_hour] = timedelta.total_seconds(end - end_hour) # fill in dates in the middle cur_hour = start_hour + timedelta(hours=1) while cur_hour < end_hour: hour_to_secs[cur_hour] = timedelta.total_seconds( timedelta(hours=1)) cur_hour += timedelta(hours=1) # remove zeros hour_to_secs = dict( (h, secs) for h, secs in hour_to_secs.items() if secs) return hour_to_secs def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs): """Get relevant cluster information from EMR. :param float max_days_ago: If set, don't fetch clusters created longer than this many days ago. :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. :param runner_kwargs: keyword args to pass through to :py:class:`~mrjob.emr.EMRJobRunner` """ if now is None: now = _boto3_now() emr_client = EMRJobRunner(**runner_kwargs).make_emr_client() # if --max-days-ago is set, only look at recent jobs created_after = None if max_days_ago is not None: created_after = now - timedelta(days=max_days_ago) # use _DELAY to sleep 1 second after each API call (see #1091). Could # implement some sort of connection wrapper for this if it becomes more # generally useful. list_clusters_kwargs = dict(_delay=_DELAY) if created_after is not None: list_clusters_kwargs['CreatedAfter'] = created_after for cluster_summary in _boto3_paginate( 'Clusters', emr_client, 'list_clusters', **list_clusters_kwargs): cluster_id = cluster_summary['Id'] cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster'] sleep(_DELAY) cluster['Steps'] = list(reversed(list(_boto3_paginate( 'Steps', emr_client, 'list_steps', ClusterId=cluster_id, _delay=_DELAY)))) yield cluster def _print_report(stats, now=None): """Print final report. :param stats: a dictionary returned by :py:func:`_clusters_to_stats` :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. 
""" if now is None: now = _boto3_now() s = stats if not s['clusters']: print('No clusters created in the past two months!') return print('Total # of Clusters: %d' % len(s['clusters'])) print() print('* All times are in UTC.') print() print('Min create time: %s' % min(cs['created'] for cs in s['clusters'])) print('Max create time: %s' % max(cs['created'] for cs in s['clusters'])) print(' Current time: %s' % now.replace(microsecond=0)) print() print('* All usage is measured in Normalized Instance Hours, which are') print(' roughly equivalent to running an m1.medium instance for an hour.') print(" Billing is estimated, and may not match Amazon's system exactly.") print() # total compute-unit hours used def with_pct(usage): return (usage, _percent(usage, s['nih_billed'])) print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed'])) print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used'])) print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used'])) print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used'])) print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu'])) print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu'])) print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu'])) print() if s['date_to_nih_billed']: print('Daily statistics:') print() print(' date billed used waste % waste') d = max(s['date_to_nih_billed']) while d >= min(s['date_to_nih_billed']): print(' %10s %9.2f %9.2f %9.2f %5.1f' % ( d, s['date_to_nih_billed'].get(d, 0.0), s['date_to_nih_used'].get(d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0), _percent(s['date_to_nih_bbnu'].get(d, 0.0), s['date_to_nih_billed'].get(d, 0.0)))) d -= timedelta(days=1) print() if s['hour_to_nih_billed']: print('Hourly statistics:') print() print(' hour billed used waste % waste') h = max(s['hour_to_nih_billed']) while h >= min(s['hour_to_nih_billed']): print(' %13s %9.2f %9.2f %9.2f %5.1f' % ( h.strftime('%Y-%m-%d %H'), s['hour_to_nih_billed'].get(h, 0.0), s['hour_to_nih_used'].get(h, 0.0), s['hour_to_nih_bbnu'].get(h, 0.0), _percent(s['hour_to_nih_bbnu'].get(h, 0.0), s['hour_to_nih_billed'].get(h, 0.0)))) h -= timedelta(hours=1) print() print('* clusters are considered to belong to the user and job that') print(' started them or last ran on them.') print() # Top jobs print('Top jobs, by total time used:') for label, nih_used in sorted(s['label_to_nih_used'].items(), key=lambda lb_nih: (-lb_nih[1], lb_nih[0])): print(' %9.2f %s' % (nih_used, label)) print() print('Top jobs, by time billed but not used:') for label, nih_bbnu in sorted( s['label_to_nih_bbnu'].items(), key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])): print(' %9.2f %s' % (nih_bbnu, label)) print() # Top users print('Top users, by total time used:') for owner, nih_used in sorted(s['owner_to_nih_used'].items(), key=lambda o_nih: (-o_nih[1], o_nih[0])): print(' %9.2f %s' % (nih_used, owner)) print() print('Top users, by time billed but not used:') for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(), key=lambda o_nih2: (-o_nih2[1], o_nih2[0])): print(' %9.2f %s' % (nih_bbnu, owner)) print() # Top job steps print('Top job steps, by total time used (step number first):') for (label, step_num), nih_used in sorted( s['job_step_to_nih_used'].items(), key=lambda k_nih: (-k_nih[1], k_nih[0])): if label: print(' %9.2f %3d %s' % (nih_used, step_num, label)) else: print(' %9.2f (non-mrjob step)' % (nih_used,)) print() print('Top job steps, by total time billed but not used (un-pooled only):') for (label, step_num), nih_bbnu in sorted( 
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):
        if label:
            print('  %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print('  %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print('  %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print('  %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print('  %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print('  id              state                               created steps'
          '          time ran      used     waste     user name')

    all_clusters = sorted(s['clusters'], key=lambda cs: cs['created'],
                          reverse=True)
    for cs in all_clusters:
        print('  %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
            (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))


def _percent(x, total, default=0.0):
    """Return what percentage *x* is of *total*, or *default* if
    *total* is zero."""
    if total:
        return 100.0 * x / total
    else:
        return default


def _round_up_to_next_second(td):
    """Round up to the next second because that's how EMR bills."""
    if td.microseconds:
        return strip_microseconds(td) + timedelta(seconds=1)
    else:
        return td


if __name__ == '__main__':
    main()
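
# This module can also be run directly (a sketch; the path assumes mrjob's
# usual source layout, where this file is mrjob/tools/emr/audit_usage.py):
#
#   python -m mrjob.tools.emr.audit_usage --max-days-ago 30 > report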