GildedRose-Refactoring-Kata/venv/Lib/site-packages/mrjob/tools/emr/s3_tmpwatch.py

154 lines
5.0 KiB
Python

# Copyright 2010-2012 Yelp
# Copyright 2013 David Marin and Steve Johnson
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Delete all files in a given URI that are older than a specified time. The
time parameter defines the threshold for removing files. If the file has not
been accessed for *time*, the file is removed. The time argument is a number
with an optional single-character suffix specifying the units: m for minutes,
h for hours, d for days. If no suffix is specified, time is in hours.
Suggested usage: run this as a cron job with the -q option::
0 0 * * * mrjob s3-tmpwatch -q 30d s3://your-bucket/tmp/
Usage::
mrjob s3-tmpwatch [options] <time-untouched> <URIs>
Options::
-c CONF_PATHS, --conf-path CONF_PATHS
Path to alternate mrjob.conf file to read from
--no-conf Don't load mrjob.conf even if it's available
-h, --help show this help message and exit
-q, --quiet Don't print anything to stderr
--region REGION GCE/AWS region to run Dataproc/EMR jobs in.
--s3-endpoint S3_ENDPOINT
Force mrjob to connect to S3 on this endpoint (e.g. s3
-us-west-1.amazonaws.com). You usually shouldn't set
this; by default mrjob will choose the correct
endpoint for each S3 bucket based on its location.
-t, --test Don't actually delete any files; just log that we
would
-v, --verbose print more messages to stderr
"""
from argparse import ArgumentParser
from datetime import timedelta
import logging
from mrjob.aws import _boto3_now
from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
log = logging.getLogger(__name__)
def main(cl_args=None):
arg_parser = _make_arg_parser()
options = arg_parser.parse_args(cl_args)
MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
time_old = _process_time(options.time_untouched)
for path in options.uris:
_s3_cleanup(path, time_old,
dry_run=options.test,
**_runner_kwargs(options))
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
"""Delete all files older than *time_old* in *path*.
If *dry_run* is true, then just log the files that need to be
deleted without actually deleting them
"""
runner = EMRJobRunner(**runner_kwargs)
log.info('Deleting all files in %s that are older than %s' %
(glob_path, time_old))
for path, key in runner.fs.s3._ls(glob_path):
age = _boto3_now() - key.last_modified
if age > time_old:
# Delete it
log.info('Deleting %s; is %s old' % (path, age))
if not dry_run:
key.delete()
def _runner_kwargs(options):
"""Options to pass to the EMRJobRunner."""
kwargs = options.__dict__.copy()
for unused_arg in ('quiet', 'verbose', 'test'):
del kwargs[unused_arg]
return kwargs
def _process_time(time):
if time[-1] == 'm':
return timedelta(minutes=int(time[:-1]))
elif time[-1] == 'h':
return timedelta(hours=int(time[:-1]))
elif time[-1] == 'd':
return timedelta(days=int(time[:-1]))
else:
return timedelta(hours=int(time))
def _make_arg_parser():
usage = '%(prog)s s3-tmpwatch [options] TIME_UNTOUCHED URI [URI ...]'
description = (
'Delete all files at one or more URIs that are older than a'
' specified time.')
arg_parser = ArgumentParser(usage=usage, description=description)
arg_parser.add_argument(
'-t', '--test', dest='test', default=False,
action='store_true',
help="Don't actually delete any files; just log that we would")
arg_parser.add_argument(
dest='time_untouched',
help='The time threshold for removing'
' files. A number with an optional'
' single-character suffix specifying the units: m for minutes, h for'
' hours, d for days. If no suffix is specified, time is in hours.')
arg_parser.add_argument(
dest='uris', nargs='+',
help='s3:// URIs specifying where to delete old files')
_add_basic_args(arg_parser)
_add_runner_args(
arg_parser,
set(['region', 's3_endpoint']),
)
_alphabetize_actions(arg_parser)
return arg_parser
if __name__ == '__main__':
main()