# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create a persistent EMR cluster to run jobs in, and print its ID to
stdout.

Usage::

    mrjob create-cluster
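
The printed ID can be passed to later runs via ``--cluster-id`` so that
jobs reuse the cluster. A sketch, where ``mr_my_job.py`` stands in for
your own job script::

    CLUSTER_ID=$(mrjob create-cluster --max-mins-idle 60)
    python mr_my_job.py -r emr --cluster-id "$CLUSTER_ID" input.txt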

Options::

  --additional-emr-info ADDITIONAL_EMR_INFO
                        A JSON string for selecting additional features on
                        EMR
  --applications APPLICATIONS, --application APPLICATIONS
                        Additional applications to run on 4.x and 5.x AMIs,
                        separated by commas (e.g. "Ganglia,Spark")
  --bootstrap BOOTSTRAP
                        A shell command to set up libraries etc. before any
                        steps (e.g. "sudo apt-get -qy install python3"). You
                        may interpolate files available via URL or locally
                        with Hadoop Distributed Cache syntax ("sudo yum
                        install -y foo.rpm#")
  --bootstrap-action BOOTSTRAP_ACTIONS
                        Raw bootstrap action scripts to run before any of the
                        other bootstrap steps. You can use --bootstrap-action
                        more than once. Local scripts will be automatically
                        uploaded to S3. To add arguments, just use quotes:
                        "foo.sh arg1 arg2"
  --bootstrap-mrjob     Automatically zip up the mrjob library and install it
                        when we run the job. This is the default. Use
                        --no-bootstrap-mrjob if you've already installed
                        mrjob on your Hadoop cluster.
  --no-bootstrap-mrjob  Don't automatically zip up the mrjob library and
                        install it when we run this job. Use this if you've
                        already installed mrjob on your Hadoop cluster.
  --bootstrap-python    Attempt to install a compatible version of Python at
                        bootstrap time. Currently this only does anything for
                        Python 3, for which it is enabled by default.
  --no-bootstrap-python
                        Don't automatically try to install a compatible
                        version of Python at bootstrap time.
  --bootstrap-spark     Auto-install Spark on the cluster (even if not
                        needed).
  --no-bootstrap-spark  Don't auto-install Spark on the cluster.
  --cloud-fs-sync-secs CLOUD_FS_SYNC_SECS
                        How long to wait for remote FS to reach eventual
                        consistency. This is typically less than a second but
                        the default is 5.0 to be safe.
  --cloud-log-dir CLOUD_LOG_DIR
                        URI on remote FS to write logs into
  --cloud-part-size-mb CLOUD_PART_SIZE_MB
                        Upload files to cloud FS in parts no bigger than this
                        many megabytes. Default is 100 MiB. Set to 0 to
                        disable multipart uploading entirely.
  --cloud-upload-part-size CLOUD_PART_SIZE_MB
                        Deprecated alias for --cloud-part-size-mb
  --cloud-tmp-dir CLOUD_TMP_DIR
                        URI on remote FS to use as our temp directory.
  -c CONF_PATHS, --conf-path CONF_PATHS
                        Path to alternate mrjob.conf file to read from
  --no-conf             Don't load mrjob.conf even if it's available
  --core-instance-bid-price CORE_INSTANCE_BID_PRICE
                        Bid price to specify for core nodes when setting them
                        up as EC2 spot instances (you probably only want to
                        do this for task instances).
  --core-instance-type CORE_INSTANCE_TYPE
                        Type of GCE/EC2 core instance(s) to launch
  --ebs-root-volume-gb EBS_ROOT_VOLUME_GB
                        Size of root EBS volume, in GiB. Must be an integer.
                        Set to 0 to use the default
  --ec2-endpoint EC2_ENDPOINT
                        Force mrjob to connect to EC2 on this endpoint (e.g.
                        ec2.us-west-1.amazonaws.com). Default is to infer
                        this from region.
  --ec2-key-pair EC2_KEY_PAIR
                        Name of the SSH key pair you set up for EMR
  --emr-action-on-failure EMR_ACTION_ON_FAILURE
                        Action to take when a step fails (e.g.
                        TERMINATE_CLUSTER, CANCEL_AND_WAIT, CONTINUE)
  --emr-configuration EMR_CONFIGURATIONS
                        Configuration to use on 4.x AMIs as a JSON-encoded
                        dict; see http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-configure-apps.html
                        for examples
  --emr-endpoint EMR_ENDPOINT
                        Force mrjob to connect to EMR on this endpoint (e.g.
                        us-west-1.elasticmapreduce.amazonaws.com). Default is
                        to infer this from region.
  --enable-emr-debugging
                        Enable storage of Hadoop logs in SimpleDB
  --disable-emr-debugging
                        Disable storage of Hadoop logs in SimpleDB (the
                        default)
  --extra-cluster-param EXTRA_CLUSTER_PARAMS
                        extra parameter to pass to cloud API when creating a
                        cluster, to access features not currently supported
                        by mrjob. Takes the form <param>=<value>, where value
                        is JSON or a string. Use <param>=null to unset a
                        parameter
  -h, --help            show this help message and exit
  --iam-endpoint IAM_ENDPOINT
                        Force mrjob to connect to IAM on this endpoint (e.g.
                        iam.us-gov.amazonaws.com)
  --iam-instance-profile IAM_INSTANCE_PROFILE
                        EC2 instance profile to use for the EMR cluster --
                        see "Configure IAM Roles for Amazon EMR" in AWS docs
  --iam-service-role IAM_SERVICE_ROLE
                        IAM service role to use for the EMR cluster -- see
                        "Configure IAM Roles for Amazon EMR" in AWS docs
  --image-id IMAGE_ID   ID of custom AWS machine image (AMI) to use
  --image-version IMAGE_VERSION
                        version of EMR/Dataproc machine image to run
  --instance-fleets INSTANCE_FLEETS
                        detailed JSON list of instance fleets, including EBS
                        configuration. See docs for --instance-fleets at
                        http://docs.aws.amazon.com/cli/latest/reference/emr/create-cluster.html
  --instance-groups INSTANCE_GROUPS
                        detailed JSON list of EMR instance configs, including
                        EBS configuration. See docs for --instance-groups at
                        http://docs.aws.amazon.com/cli/latest/reference/emr/create-cluster.html
  --instance-type INSTANCE_TYPE
                        Type of GCE/EC2 instance(s) to launch
                        GCE - e.g. n1-standard-1, n1-highcpu-4, n1-highmem-4
                        -- See https://cloud.google.com/compute/docs/machine-types
                        EC2 - e.g. m1.medium, c3.xlarge, r3.xlarge
                        -- See http://aws.amazon.com/ec2/instance-types/
  --label LABEL         Alternate label for the job, to help us identify it.
  --master-instance-bid-price MASTER_INSTANCE_BID_PRICE
                        Bid price to specify for the master node when setting
                        it up as an EC2 spot instance (you probably only want
                        to do this for task instances).
  --master-instance-type MASTER_INSTANCE_TYPE
                        Type of GCE/EC2 master instance to launch
  --max-mins-idle MAX_MINS_IDLE
                        If we create a cluster, have it automatically
                        terminate itself after it's been idle this many
                        minutes
  --num-core-instances NUM_CORE_INSTANCES
                        Total number of core instances to launch
  --num-task-instances NUM_TASK_INSTANCES
                        Total number of task instances to launch
  --owner OWNER         User who ran the job (default is the current user)
  --pool-clusters       Add to an existing cluster or create a new one that
                        does not terminate when the job completes.
  --no-pool-clusters    Don't run job on a pooled cluster (the default)
  --pool-name POOL_NAME
                        Specify a pool name to join. Default is "default"
  -q, --quiet           Don't print anything to stderr
  --region REGION       GCE/AWS region to run Dataproc/EMR jobs in.
  --release-label RELEASE_LABEL
                        Release Label (e.g. "emr-4.0.0"). Overrides
                        --image-version
  --s3-endpoint S3_ENDPOINT
                        Force mrjob to connect to S3 on this endpoint (e.g.
                        s3-us-west-1.amazonaws.com). You usually shouldn't
                        set this; by default mrjob will choose the correct
                        endpoint for each S3 bucket based on its location.
  --subnet SUBNET       ID of Amazon VPC subnet/URI of Google Compute Engine
                        subnetwork to launch cluster in.
  --subnets SUBNET      Like --subnet, but with a comma-separated list, to
                        specify multiple subnets in conjunction with
                        --instance-fleets (EMR only)
  --tag TAGS            Metadata tags to apply to the EMR cluster; should
                        take the form KEY=VALUE. You can use --tag multiple
                        times
  --task-instance-bid-price TASK_INSTANCE_BID_PRICE
                        Bid price to specify for task nodes when setting them
                        up as EC2 spot instances
  --task-instance-type TASK_INSTANCE_TYPE
                        Type of GCE/EC2 task instance(s) to launch
  -v, --verbose         print more messages to stderr
  --zone ZONE           GCE zone/AWS availability zone to run Dataproc/EMR
                        jobs in.
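
For example, the following would create a pooled cluster that terminates
itself after an hour of idleness (the option values are illustrative)::

    mrjob create-cluster --pool-clusters --max-mins-idle 60 --region us-west-2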
"""
from __future__ import print_function

from argparse import ArgumentParser

from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
from mrjob.options import _filter_by_role


def main(args=None):
    """Run the create_cluster tool with arguments from ``sys.argv``,
    printing the new cluster's ID to ``sys.stdout``."""
    runner = EMRJobRunner(**_runner_kwargs(args))
    cluster_id = runner.make_persistent_cluster()
    print(cluster_id)


def _runner_kwargs(cl_args=None):
    """Parse command line arguments into keyword arguments for
    :py:class:`EMRJobRunner`.
    """
    # parse command-line args
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # build runner kwargs from the parsed options, dropping the
    # logging-only switches, which EMRJobRunner doesn't accept
    kwargs = options.__dict__.copy()
    del kwargs['quiet']
    del kwargs['verbose']

    return kwargs


def _make_arg_parser():
    usage = '%(prog)s create-cluster [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout.')

    arg_parser = ArgumentParser(usage=usage, description=description)

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'))

    _alphabetize_actions(arg_parser)

    return arg_parser


if __name__ == '__main__':
    main()
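

# A minimal programmatic sketch (an illustration, not part of this tool's
# documented API): the same cluster creation can be driven directly from
# Python, assuming mrjob is configured with AWS credentials. The option
# values below are illustrative.
#
#     from mrjob.emr import EMRJobRunner
#
#     runner = EMRJobRunner(region='us-west-2', max_mins_idle=60)
#     print(runner.make_persistent_cluster())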