mirror of
https://github.com/emilybache/GildedRose-Refactoring-Kata.git
synced 2026-02-23 02:11:10 +00:00
53 lines
1.6 KiB
Python
53 lines
1.6 KiB
Python
# Copyright 2016 Yelp
|
|
# Copyright 2019 Yelp
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Word frequency count with output arranged into subdirectories by
|
|
first letter, using nicknack.MultipleValueOutputFormat
|
|
"""
|
|
import json
|
|
import re
|
|
|
|
from mrjob.job import MRJob
|
|
from mrjob.protocol import RawValueProtocol
|
|
|
|
# restrict to a-z so we don't get odd filenames
|
|
WORD_RE = re.compile(r"[A-Za-z]+")
|
|
|
|
|
|
class MRNickNack(MRJob):
|
|
|
|
# tell hadoop to massage our mrjob output using this output format
|
|
HADOOP_OUTPUT_FORMAT = 'nicknack.MultipleValueOutputFormat'
|
|
|
|
# use the nicknack JAR in this directory
|
|
#
|
|
# This JAR was downloaded from https://github.com/empiricalresults/nicknack
|
|
# and is also under the Apache 2.0 license.
|
|
LIBJARS = ['nicknack-1.0.1.jar']
|
|
|
|
# tell mrjob not to format our output -- leave that to hadoop
|
|
OUTPUT_PROTOCOL = RawValueProtocol
|
|
|
|
def mapper(self, _, line):
|
|
for word in WORD_RE.findall(line):
|
|
yield (word.lower(), 1)
|
|
|
|
def reducer(self, word, counts):
|
|
total = sum(counts)
|
|
yield None, '\t'.join([word[0], json.dumps(word), json.dumps(total)])
|
|
|
|
|
|
if __name__ == '__main__':
|
|
MRNickNack.run()
|