# Source: GildedRose-Refactoring-Kata/venv/Lib/site-packages/mrjob/tools/emr/audit_usage.py
# (821 lines, 30 KiB, Python)
# Copyright 2009-2010 Yelp
# Copyright 2015-2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Audit EMR usage over the past 2 weeks, sorted by cluster name and user.
Usage::
mrjob audit-emr-usage > report
Options::
-c CONF_PATHS, --conf-path CONF_PATHS
Path to alternate mrjob.conf file to read from
--no-conf Don't load mrjob.conf even if it's available
--ec2-endpoint EC2_ENDPOINT
Force mrjob to connect to EC2 on this endpoint (e.g.
ec2.us-west-1.amazonaws.com). Default is to infer this
from region.
--emr-endpoint EMR_ENDPOINT
Force mrjob to connect to EMR on this endpoint (e.g.
us-west-1.elasticmapreduce.amazonaws.com). Default is
to infer this from region.
-h, --help show this help message and exit
--max-days-ago MAX_DAYS_AGO
Max number of days ago to look at jobs. By default, we
go back as far as EMR supports (currently about 2
months)
-q, --quiet Don't print anything to stderr
--region REGION GCE/AWS region to run Dataproc/EMR jobs in.
--s3-endpoint S3_ENDPOINT
Force mrjob to connect to S3 on this endpoint (e.g. s3
-us-west-1.amazonaws.com). You usually shouldn't set
this; by default mrjob will choose the correct
endpoint for each S3 bucket based on its location.
-v, --verbose print more messages to stderr
"""
# This just approximates EMR billing rules. For the actual rules, see:
#
# http://aws.amazon.com/elasticmapreduce/faqs/
from __future__ import print_function
import math
import logging
import re
from argparse import ArgumentParser
from datetime import datetime
from datetime import timedelta
from time import sleep
from mrjob.aws import _boto3_now
from mrjob.aws import _boto3_paginate
from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
from mrjob.options import _filter_by_role
from mrjob.pool import _pool_name
from mrjob.util import strip_microseconds
# match an mrjob job key (used to uniquely identify the job)
# groups: label, owner, then three numeric fields
_JOB_KEY_RE = re.compile(r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+)$')

# match an mrjob step name (these are used to name steps in EMR)
# same format as a job key, followed by ": Step X of Y"
_STEP_NAME_RE = re.compile(
    r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+): Step (\d+) of (\d+)$')

# wait one second between successive calls to EMR API
_DELAY = 1

# module-level logger, configured by MRJob.set_up_logging() in main()
log = logging.getLogger(__name__)
def main(args=None):
    """Entry point: parse command-line args, pull cluster history from
    EMR, aggregate it, and print the usage report.

    :param args: list of command-line args (defaults to ``sys.argv[1:]``)
    """
    # parse command-line args
    parser = _make_arg_parser()
    opts = parser.parse_args(args)

    MRJob.set_up_logging(quiet=opts.quiet, verbose=opts.verbose)

    now = _boto3_now()

    log.info('getting cluster history...')
    cluster_list = list(_yield_clusters(
        max_days_ago=opts.max_days_ago, now=now, **_runner_kwargs(opts)))

    log.info('compiling cluster stats...')
    report_stats = _clusters_to_stats(cluster_list, now=now)

    _print_report(report_stats, now=now)
def _make_arg_parser():
    """Build the :py:class:`~argparse.ArgumentParser` for this tool,
    including the tool-specific ``--max-days-ago`` switch plus the basic
    and EMR-connection options shared with other mrjob tools."""
    parser = ArgumentParser(
        usage='%(prog)s audit-emr-usage [options]',
        description='Print a giant report on EMR usage.')

    parser.add_argument(
        '--max-days-ago', dest='max_days_ago', type=float, default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    _add_basic_args(parser)
    _add_runner_args(
        parser, _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    # sort options alphabetically in --help output
    _alphabetize_actions(parser)

    return parser
def _runner_kwargs(options):
kwargs = options.__dict__.copy()
for unused_arg in ('quiet', 'verbose', 'max_days_ago'):
del kwargs[unused_arg]
return kwargs
def _clusters_to_stats(clusters, now=None):
    r"""Aggregate statistics for several clusters into a dictionary.

    :param clusters: a sequence of dicts with the keys ``cluster``, ``steps``.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with many keys, including:

    * *summaries*: A list of dictionaries; the result of running
      :py:func:`_cluster_to_full_summary` on each cluster.

    total usage:

    * *nih_billed*: total normalized instances hours billed, for all clusters
    * *nih_used*: total normalized instance hours actually used for
      bootstrapping and running jobs.
    * *nih_bbnu*: total usage billed but not used (`nih_billed - nih_used`)

    further breakdown of total usage:

    * *bootstrap_nih_used*: total usage for bootstrapping
    * *end_nih_bbnu*: unused time at the end of clusters
    * *job_nih_used*: total usage for jobs (`nih_used - bootstrap_nih_used`)
    * *other_nih_bbnu*: other unused time (`nih_bbnu - end_nih_bbnu`)

    grouping by various keys:

    (There is a *_used*, *_billed*, and *_bbnu* version of all stats below)

    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours during the hour starting at that time
    * *label_to_nih_\**: map from jobs' labels (usually the module name of
      the job) to normalized instance hours, with ``None`` for
      non-:py:mod:`mrjob` jobs. This includes usage data for bootstrapping.
    * *job_step_to_nih_\**: map from jobs' labels and step number to
      normalized instance hours, using ``(None, None)`` for non-:py:mod:`mrjob`
      jobs. This does not include bootstrapping.
    * *job_step_to_nih_\*_no_pool*: Same as *job_step_to_nih_\**, but only
      including non-pooled clusters.
    * *owner_to_nih_\**: map from jobs' owners (usually the user who ran them)
      to normalized instance hours, with ``None`` for non-:py:mod:`mrjob` jobs.
      This includes usage data for bootstrapping.
    * *pool_to_nih_\**: Map from pool name to normalized instance hours,
      with ``None`` for non-pooled jobs and non-:py:mod:`mrjob` jobs.
    """
    s = {}  # stats for all clusters

    s['clusters'] = [_cluster_to_full_summary(cluster, now=now)
                     for cluster in clusters]

    # from here on out, we only process s['clusters']

    # total usage
    for nih_type in ('nih_billed', 'nih_used', 'nih_bbnu'):
        s[nih_type] = float(sum(
            cs[nih_type] for cs in s['clusters']))

    # break down by usage/waste
    # usage[0] is the fake "bootstrap" interval built by
    # _cluster_to_usage_data, so its nih_used is time spent bootstrapping
    s['bootstrap_nih_used'] = float(sum(
        cs['usage'][0]['nih_used'] for cs in s['clusters']
        if cs['usage']))
    s['job_nih_used'] = s['nih_used'] - s['bootstrap_nih_used']
    # the last interval's billed-but-not-used time is idle time at the
    # end of the cluster
    s['end_nih_bbnu'] = float(sum(
        cs['usage'][-1]['nih_bbnu'] for cs in s['clusters']
        if cs['usage']))
    s['other_nih_bbnu'] = s['nih_bbnu'] - s['end_nih_bbnu']

    # stats by date/hour
    for interval_type in ('date', 'hour'):
        for nih_type in ('nih_billed', 'nih_used', 'nih_bbnu'):
            # e.g. 'date_to_nih_billed'; the same key indexes the
            # per-interval usage dicts
            key = '%s_to_%s' % (interval_type, nih_type)
            start_to_nih = {}
            for cs in s['clusters']:
                for u in cs['usage']:
                    for start, nih in u[key].items():
                        start_to_nih.setdefault(start, 0.0)
                        start_to_nih[start] += nih
            s[key] = start_to_nih

    # break out by label (usually script name) and owner (usually current user)
    for key in ('label', 'owner'):
        for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'):
            key_to_nih = {}
            for cs in s['clusters']:
                for u in cs['usage']:
                    key_to_nih.setdefault(u[key], 0.0)
                    key_to_nih[u[key]] += u[nih_type]
            s['%s_to_%s' % (key, nih_type)] = key_to_nih

    # break down by job step. separate out un-pooled jobs
    for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'):
        job_step_to_nih = {}
        job_step_to_nih_no_pool = {}
        for cs in s['clusters']:
            # usage[1:] skips the bootstrap interval (not tied to a step)
            for u in cs['usage'][1:]:
                job_step = (u['label'], u['step_num'])
                job_step_to_nih.setdefault(job_step, 0.0)
                job_step_to_nih[job_step] += u[nih_type]
                if not cs['pool']:
                    job_step_to_nih_no_pool.setdefault(job_step, 0.0)
                    job_step_to_nih_no_pool[job_step] += u[nih_type]

        s['job_step_to_%s' % nih_type] = job_step_to_nih
        s['job_step_to_%s_no_pool' % nih_type] = job_step_to_nih_no_pool

    # break down by pool
    for nih_type in ('nih_used', 'nih_billed', 'nih_bbnu'):
        pool_to_nih = {}
        for cs in s['clusters']:
            pool_to_nih.setdefault(cs['pool'], 0.0)
            pool_to_nih[cs['pool']] += cs[nih_type]

        s['pool_to_%s' % nih_type] = pool_to_nih

    return s
def _cluster_to_full_summary(cluster, now=None):
    """Build a full cluster summary for the report, adding billing/usage
    totals on top of the basic summary.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns the dictionary from :py:func:`_cluster_to_basic_summary`,
    augmented with:

    * *nih_billed*: total normalized instance hours billed for this cluster
    * *nih_used*: total normalized instance hours actually used for
      bootstrapping and running jobs
    * *nih_bbnu*: total usage billed but not used (`nih_billed - nih_used`)
    * *usage*: per-job usage information from
      :py:func:`_cluster_to_usage_data`
    """
    summary = _cluster_to_basic_summary(cluster, now=now)

    summary['usage'] = _cluster_to_usage_data(
        cluster, basic_summary=summary, now=now)

    # roll each per-interval billing field up into a cluster-wide total
    for nih_type in ('nih_billed', 'nih_used', 'nih_bbnu'):
        summary[nih_type] = float(
            sum(interval[nih_type] for interval in summary['usage']))

    return summary
def _cluster_to_basic_summary(cluster, now=None):
    """Extract creation time, owner, state, etc. from a cluster.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with these keys (``None`` when the corresponding
    cluster field is unavailable):

    * *created*: UTC `datetime.datetime` the cluster was created, or ``None``
    * *end*: UTC `datetime.datetime` the cluster finished, or ``None``
    * *id*: cluster ID, or ``None`` (this should never happen)
    * *label*: label for the cluster (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or ``None``
      for non-:py:mod:`mrjob` clusters
    * *name*: cluster name, or ``None`` (this should never happen)
    * *nih*: normalized instance hours the cluster *would* use if it ran
      to the end of the next full hour
    * *num_steps*: number of steps in the cluster
    * *owner*: owner of the cluster (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` clusters
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``
    * *ran*: how long the cluster ran (or has been running) as a
      :py:class:`datetime.timedelta`; ``timedelta(0)`` if it hasn't started
    * *ready*: UTC `datetime.datetime` the cluster finished bootstrapping,
      or ``None``
    * *state*: the cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = _boto3_now()

    summary = {}  # basic cluster summary to fill in

    summary['id'] = cluster['Id']
    summary['name'] = cluster['Name']

    status = cluster['Status']
    timeline = status.get('Timeline', {})

    summary['created'] = timeline.get('CreationDateTime')
    summary['ready'] = timeline.get('ReadyDateTime')
    summary['end'] = timeline.get('EndDateTime')

    # run time: creation until the cluster ended (or until now, if it's
    # still running). Zero if the cluster never started.
    if summary['created']:
        summary['ran'] = (summary['end'] or now) - summary['created']
    else:
        summary['ran'] = timedelta(0)

    summary['state'] = status.get('State')
    summary['num_steps'] = len(cluster['Steps'])
    summary['pool'] = _pool_name(cluster)

    # mrjob-started clusters encode label and owner in the cluster name
    match = _JOB_KEY_RE.match(summary['name'] or '')
    if match:
        summary['label'], summary['owner'] = match.group(1), match.group(2)
    else:
        summary['label'], summary['owner'] = None, None

    summary['nih'] = float(cluster.get('NormalizedInstanceHours', 0))

    return summary
def _cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a cluster down by job.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`_cluster_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`_cluster_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the cluster hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the cluster.
    * *nih_billed*: normalized instances hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    """
    bcs = basic_summary or _cluster_to_basic_summary(cluster)

    if now is None:
        now = _boto3_now()

    # cluster never started; no usage to report
    if not bcs['created']:
        return []

    # EMR no longer bills by the full hour, but NormalizedInstanceHours
    # still works that way
    # NOTE(review): if bcs['ran'] is exactly zero, full_hours is 0 and the
    # division below raises ZeroDivisionError — confirm this can't happen
    # (bcs['created'] is set, so 'ran' should normally be positive)
    full_hours = math.ceil(timedelta.total_seconds(bcs['ran']) / 60.0 / 60.0)
    nih_per_sec = bcs['nih'] / (full_hours * 3600.0)

    # EMR bills by the full second, and at least one minute per cluster
    cluster_end_billing = bcs['created'] + max(
        _round_up_to_next_second(bcs['ran']), timedelta(minutes=1))

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    for step in cluster['Steps']:
        Status = step['Status']
        Timeline = Status.get('Timeline', {})

        # we've reached the last step that's actually run
        if not Timeline.get('StartDateTime'):
            break

        step_start = Timeline['StartDateTime']

        step_end = Timeline.get('EndDateTime')
        if step_end is None:
            # step started running and was cancelled. credit it for 0 usage
            if bcs['end']:
                step_end = step_start
            # step is still running
            else:
                step_end = now

        # recover label/owner/step number from mrjob's step-name format;
        # non-mrjob steps get None for all three
        m = _STEP_NAME_RE.match(step['Name'])
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one
    # starts; the final interval is billed until the cluster's billing ends
    for i in range(len(intervals) - 1):
        intervals[i]['end_billing'] = intervals[i + 1]['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec *
            timedelta.total_seconds(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(interval['start'],
                                           interval['end']).items())

        interval['hour_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(interval['start'],
                                           interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec * timedelta.total_seconds(
                interval['end_billing'] - interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(interval['start'],
                                           interval['end_billing']).items())

        interval['hour_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(interval['start'],
                                           interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        # per-date and per-hour bbnu = billed minus used, dropping zeros
        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

        interval['hour_to_nih_bbnu'] = {}
        for d, nih_billed in interval['hour_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['hour_to_nih_bbnu'][d] = nih_bbnu

    return intervals
def _subdivide_interval_by_date(start, end):
"""Convert a time interval to a map from :py:class:`datetime.date` to
the number of seconds within the interval on that date.
*start* and *end* are :py:class:`datetime.datetime` objects.
"""
if start.date() == end.date():
date_to_secs = {start.date(): timedelta.total_seconds(end - start)}
else:
date_to_secs = {}
date_to_secs[start.date()] = timedelta.total_seconds(
datetime(start.year, start.month, start.day, tzinfo=start.tzinfo) +
timedelta(days=1) - start)
date_to_secs[end.date()] = timedelta.total_seconds(
end - datetime(end.year, end.month, end.day, tzinfo=end.tzinfo))
# fill in dates in the middle
cur_date = start.date() + timedelta(days=1)
while cur_date < end.date():
date_to_secs[cur_date] = timedelta.total_seconds(timedelta(days=1))
cur_date += timedelta(days=1)
# remove zeros
date_to_secs = dict(
(d, secs) for d, secs in date_to_secs.items() if secs)
return date_to_secs
def _subdivide_interval_by_hour(start, end):
"""Convert a time interval to a map from hours (represented as
:py:class:`datetime.datetime` for the start of the hour) to the number of
seconds during that hour that are within the interval
*start* and *end* are :py:class:`datetime.datetime` objects.
"""
start_hour = start.replace(minute=0, second=0, microsecond=0)
end_hour = end.replace(minute=0, second=0, microsecond=0)
if start_hour == end_hour:
hour_to_secs = {start_hour: timedelta.total_seconds(end - start)}
else:
hour_to_secs = {}
hour_to_secs[start_hour] = timedelta.total_seconds(
start_hour + timedelta(hours=1) - start)
hour_to_secs[end_hour] = timedelta.total_seconds(end - end_hour)
# fill in dates in the middle
cur_hour = start_hour + timedelta(hours=1)
while cur_hour < end_hour:
hour_to_secs[cur_hour] = timedelta.total_seconds(
timedelta(hours=1))
cur_hour += timedelta(hours=1)
# remove zeros
hour_to_secs = dict(
(h, secs) for h, secs in hour_to_secs.items() if secs)
return hour_to_secs
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR, one cluster at a time.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_kwargs = dict(_delay=_DELAY)

    # if --max-days-ago is set, only look at recent jobs
    if max_days_ago is not None:
        list_kwargs['CreatedAfter'] = now - timedelta(days=max_days_ago)

    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', **list_kwargs)

    for summary in cluster_summaries:
        cluster_id = summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        # reverse the step list — presumably list_steps returns steps
        # newest-first and we want them in the order they ran
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))
        cluster['Steps'] = list(reversed(steps))

        yield cluster
def _print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = _boto3_now()

    s = stats  # shorthand alias used throughout this function

    # nothing to report
    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    print('Total # of Clusters: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print(' Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.medium instance for an hour.')
    print(" Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    # helper: pair a usage number with its percentage of total billed hours
    def with_pct(usage):
        return (usage, _percent(usage, s['nih_billed']))

    print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']))
    print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']))
    print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']))
    print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']))
    print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date billed used waste % waste')
        # walk backward day-by-day from the newest date to the oldest
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                         s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour billed used waste % waste')
        # walk backward hour-by-hour from the newest hour to the oldest
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s %9.2f %9.2f %9.2f %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                         s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print(' started them or last ran on them.')
    print()

    # Top jobs
    # sort descending by usage, breaking ties by label
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print(' %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(
            s['label_to_nih_bbnu'].items(),
            key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])):
        print(' %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print(' %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print(' %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(),
            key=lambda k_nih: (-k_nih[1], k_nih[0])):
        # label is None for non-mrjob steps
        if label:
            print(' %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_used,))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):
        if label:
            print(' %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print(' %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print(' %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print(' %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print(' %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print(' id state created steps'
          ' time ran billed waste user name')

    # newest clusters first
    all_clusters = sorted(s['clusters'], key=lambda cs: cs['created'],
                          reverse=True)
    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
            (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
def _percent(x, total, default=0.0):
"""Return what percentage *x* is of *total*, or *default* if
*total* is zero."""
if total:
return 100.0 * x / total
else:
return default
def _round_up_to_next_second(td):
    """Round the :py:class:`~datetime.timedelta` *td* up to a whole number
    of seconds, because that's how EMR bills."""
    if not td.microseconds:
        # already a whole number of seconds
        return td
    return strip_microseconds(td) + timedelta(seconds=1)
# allow running this module directly as a script
if __name__ == '__main__':
    main()