    Presentation Transcript
    1. MapReduce Design Patterns CMSC 491/691 Hadoop-Based Distributed Computing Spring 2014 Adam Shook

    2. Agenda • Summarization Patterns • Filtering Patterns • Data Organization Patterns • Join Patterns • Metapatterns • I/O Patterns • Bloom Filters

    3. Summarization Patterns: Numerical Summarizations, Inverted Index, Counting with Counters

    4. Overview • Top-down summarization of large data sets • Most straightforward patterns • Calculate aggregates over entire data set or groups • Build indexes

    5. Numerical Summarizations • Group records together by a field or set of fields and calculate a numerical aggregate per group • Build histograms or calculate statistics from numerical values

    6. Known Uses • Word Count • Record Count • Min/Max/Count • Average/Median/Standard Deviation

    7. Structure

    8. Performance • Perform well, especially when a combiner is used • Need to be concerned about data skew from the keys

    9. Example • Discover the first time a StackOverflow user posted, the last time a user posted, and the number of posts in between • User ID, Min Date, Max Date, Count

    10. public class MinMaxCountTuple implements Writable {
          private Date min = new Date();
          private Date max = new Date();
          private long count = 0;
          private final static SimpleDateFormat frmt =
              new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
          public Date getMin() { return min; }
          public void setMin(Date min) { this.min = min; }
          public Date getMax() { return max; }
          public void setMax(Date max) { this.max = max; }
          public long getCount() { return count; }
          public void setCount(long count) { this.count = count; }
          public void readFields(DataInput in) throws IOException {
            min = new Date(in.readLong());
            max = new Date(in.readLong());
            count = in.readLong();
          }
          public void write(DataOutput out) throws IOException {
            out.writeLong(min.getTime());
            out.writeLong(max.getTime());
            out.writeLong(count);
          }
          public String toString() {
            return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count;
          }
        }

    11. public static class MinMaxCountMapper
            extends Mapper<Object, Text, Text, MinMaxCountTuple> {
          private Text outUserId = new Text();
          private MinMaxCountTuple outTuple = new MinMaxCountTuple();
          private final static SimpleDateFormat frmt =
              new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
          public void map(Object key, Text value, Context context)
              throws IOException, InterruptedException {
            Map<String, String> parsed = xmlToMap(value.toString());
            String strDate = parsed.get("CreationDate");
            String userId = parsed.get("UserId");
            try {
              Date creationDate = frmt.parse(strDate);
              outTuple.setMin(creationDate);
              outTuple.setMax(creationDate);
              outTuple.setCount(1);
              outUserId.set(userId);
              context.write(outUserId, outTuple);
            } catch (ParseException e) {
              // Ignore records whose CreationDate cannot be parsed
            }
          }
        }

    12. public static class MinMaxCountReducer
            extends Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {
          private MinMaxCountTuple result = new MinMaxCountTuple();
          public void reduce(Text key, Iterable<MinMaxCountTuple> values,
              Context context) throws IOException, InterruptedException {
            result.setMin(null);
            result.setMax(null);
            result.setCount(0);
            int sum = 0;
            for (MinMaxCountTuple val : values) {
              if (result.getMin() == null
                  || val.getMin().compareTo(result.getMin()) < 0) {
                result.setMin(val.getMin());
              }
              if (result.getMax() == null
                  || val.getMax().compareTo(result.getMax()) > 0) {
                result.setMax(val.getMax());
              }
              sum += val.getCount();
            }
            result.setCount(sum);
            context.write(key, result);
          }
        }

    13. public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          String[] otherArgs =
              new GenericOptionsParser(conf, args).getRemainingArgs();
          if (otherArgs.length != 2) {
            System.err.println("Usage: MinMaxCountDriver <in> <out>");
            System.exit(2);
          }
          Job job = new Job(conf, "Comment Date Min Max Count");
          job.setJarByClass(MinMaxCountDriver.class);
          job.setMapperClass(MinMaxCountMapper.class);
          job.setCombinerClass(MinMaxCountReducer.class);
          job.setReducerClass(MinMaxCountReducer.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(MinMaxCountTuple.class);
          FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
          FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
          System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

    14. -- Filename: MinMaxCount.pig
        A = LOAD '$input' USING PigStorage(',') AS (name:chararray, age:int);
        B = GROUP A BY name;
        C = FOREACH B GENERATE group AS name, MIN(A.age), MAX(A.age), COUNT(A);
        STORE C INTO '$output';
        -- Execution
        -- pig -f MinMaxCount.pig -p input=users.txt -p output=pig-out

    15. -- Filename: MinMaxCount.hql
        DROP TABLE IF EXISTS users;
        CREATE EXTERNAL TABLE users (name STRING, age INT)
          ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
          LOCATION '/user/shadam1/hive-tweets'; -- Directory containing data
        INSERT OVERWRITE DIRECTORY '/user/shadam1/hive-out'
        SELECT name, MIN(age), MAX(age), COUNT(*)
        FROM users
        GROUP BY name;
        -- Execution
        -- hive -f MinMaxCount.hql

    16. Inverted Index • Generate an index from a data set to enable fast searches or data enrichment • Building an index takes time, but can greatly reduce the amount of time to search for something • Output can be ingested into key/value store

    17. Structure

    18. Performance • Depends on how complex the content is to parse in the mapper and how many indices you are building per record • Possibility of a data explosion if indexing many fields

    19. Example • Extract URLs from StackOverflow comments that reference a Wikipedia page • Wikipedia URL -> List of comment IDs

    20. public static class WikipediaExtractor
            extends Mapper<Object, Text, Text, Text> {
          private Text link = new Text();
          private Text outvalue = new Text();
          public void map(Object key, Text value, Context context)
              throws IOException, InterruptedException {
            Map<String, String> parsed = xmlToMap(value.toString());
            String txt = parsed.get("Body");
            String posttype = parsed.get("PostTypeId");
            String row_id = parsed.get("Id");
            // Skip records with no body and question posts (PostTypeId of 1)
            if (txt == null || (posttype != null && posttype.equals("1"))) {
              return;
            }
            txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase());
            link.set(getWikipediaURL(txt));
            outvalue.set(row_id);
            context.write(link, outvalue);
          }
        }

    21. public static class Concatenator extends Reducer<Text, Text, Text, Text> {
          private Text result = new Text();
          public void reduce(Text key, Iterable<Text> values, Context context)
              throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (Text id : values) {
              if (first) {
                first = false;
              } else {
                sb.append(" ");
              }
              sb.append(id.toString());
            }
            result.set(sb.toString());
            context.write(key, result);
          }
        }

    22. Combiner • Can be used to do concatenation prior to the reduce phase
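
        Because the Concatenator's input and output types match and joining ID lists is order-insensitive, the reducer above can double as the combiner. A minimal driver sketch under that assumption (job name and wiring chosen here for illustration, not taken from the slides):

        Job job = new Job(conf, "Wikipedia Inverted Index"); // name assumed
        job.setMapperClass(WikipediaExtractor.class);
        job.setCombinerClass(Concatenator.class); // pre-concatenate IDs map-side
        job.setReducerClass(Concatenator.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);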

    23. Counting with Counters • Use MapReduce framework’s counter utility to calculate global sum entirely on the map side, producing no output • Small number of counters only!!

    24. Known Uses • Count number of records • Count a small number of unique field instances • Sum fields of data together

    25. Structure

    26. Performance • Map-only job • Produces no output • About as fast as you can get

    27. Example • Count the number of StackOverflow users by state

    28. public static class CountNumUsersByStateMapper
            extends Mapper<Object, Text, NullWritable, NullWritable> {
          private String[] statesArray = new String[] { ... };
          private HashSet<String> states =
              new HashSet<String>(Arrays.asList(statesArray));
          public void map(Object key, Text value, Context context) {
            Map<String, String> parsed = xmlToMap(value.toString());
            String location = parsed.get("Location");
            if (location != null && !location.isEmpty()) {
              String[] tokens = location.toUpperCase().split("\\s");
              boolean unknown = true;
              for (String state : tokens) {
                if (states.contains(state)) {
                  context.getCounter(STATE_COUNTER_GROUP, state).increment(1);
                  unknown = false;
                  break;
                }
              }
              if (unknown) {
                context.getCounter(STATE_COUNTER_GROUP, UNKNOWN_COUNTER).increment(1);
              }
            } else {
              context.getCounter(STATE_COUNTER_GROUP,
                  NULL_OR_EMPTY_COUNTER).increment(1);
            }
          }
        }

    29. ... // Job configuration
        int code = job.waitForCompletion(true) ? 0 : 1;
        if (code == 0) {
          for (Counter counter : job.getCounters().getGroup(
              CountNumUsersByStateMapper.STATE_COUNTER_GROUP)) {
            System.out.println(counter.getDisplayName() + "\t" + counter.getValue());
          }
        }
        // Clean up empty output directory
        FileSystem.get(conf).delete(outputDir, true);
        System.exit(code);

    30. Filtering Patterns: Filtering, Bloom Filtering, Top Ten, Distinct

    31. Filtering • Discard records that are not of interest • Create subsets of your big data sets that you want to further analyze

    32. Known Uses • Closer view of the data • Tracking a thread of events • Distributed grep • Data cleansing • Simple random sampling

    33. Structure

    34. Performance • Generally map-only • Need to be aware of the size and number of output files

    35. Example • Applying a configurable regular expression to lines of text

    36. public static class GrepMapper
            extends Mapper<Object, Text, NullWritable, Text> {
          private String mapRegex = null;
          public void setup(Context context) {
            mapRegex = context.getConfiguration().get("mapregex");
          }
          public void map(Object key, Text value, Context context)
              throws IOException, InterruptedException {
            if (value.toString().matches(mapRegex)) {
              context.write(NullWritable.get(), value);
            }
          }
        }
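
        Simple random sampling (listed under the known uses on slide 32) follows the same map-only shape. A minimal sketch, assuming the sample rate is passed in a configuration property named "filter_percentage" (a name chosen here for illustration):

        public static class SRSMapper
            extends Mapper<Object, Text, NullWritable, Text> {
          private Random rands = new Random();
          private float percentage;
          protected void setup(Context context) {
            // e.g. 0.01f keeps roughly 1% of the input records
            percentage = context.getConfiguration()
                .getFloat("filter_percentage", 0.0f);
          }
          public void map(Object key, Text value, Context context)
              throws IOException, InterruptedException {
            if (rands.nextFloat() < percentage) {
              context.write(NullWritable.get(), value);
            }
          }
        }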

    37. Bloom Filtering • Keep records that are a member of a large predefined set of values • Inherent possibility of false positives

    38. Known Uses • Removing most of the non-watched values • Pre-filtering a data set prior to expensive membership test

    39. Structure

    40. Performance • Similar to simple filtering • Loading of the Bloom filter is relatively inexpensive and checking a Bloom filter is O(1)

    41. Example • Filter out StackOverflow comments that do not contain at least one keyword

    42. public class BloomFilterDriver {
          public static void main(String[] args) throws Exception {
            Path inputFile = new Path(args[0]);
            int numMembers = Integer.parseInt(args[1]);
            float falsePosRate = Float.parseFloat(args[2]);
            Path bfFile = new Path(args[3]);
            int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
            int nbHash = getOptimalK(numMembers, vectorSize);
            BloomFilter filter = new BloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH);
            String line = null;
            FileSystem fs = FileSystem.get(new Configuration());
            BufferedReader rdr = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(fs.open(inputFile))));
            // Train the filter with one hot-list value per line
            while ((line = rdr.readLine()) != null) {
              filter.add(new Key(line.getBytes()));
            }
            rdr.close();
            FSDataOutputStream strm = fs.create(bfFile);
            filter.write(strm);
            strm.flush();
            strm.close();
            System.exit(0);
          }
        }
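
        The driver above calls getOptimalBloomFilterSize and getOptimalK, which are not shown on the captured slides. A sketch based on the standard Bloom filter sizing formulas (the method names mirror the calls above; treat the bodies as an assumption rather than the course's exact code):

        // Optimal bit vector size: m = -n * ln(p) / (ln 2)^2
        // for n members and false positive rate p
        public static int getOptimalBloomFilterSize(int numMembers, float falsePosRate) {
          return (int) Math.ceil(-numMembers * Math.log(falsePosRate)
              / (Math.log(2) * Math.log(2)));
        }

        // Optimal number of hash functions: k = (m / n) * ln 2
        public static int getOptimalK(float numMembers, float vectorSize) {
          return (int) Math.round(vectorSize / numMembers * Math.log(2));
        }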

    43. public static class BloomFilteringMapper
            extends Mapper<Object, Text, Text, NullWritable> {
          private BloomFilter filter = new BloomFilter();
          protected void setup(Context context) throws IOException {
            Path[] files =
                DistributedCache.getLocalCacheFiles(context.getConfiguration());
            DataInputStream strm = new DataInputStream(
                new FileInputStream(files[0].toString()));
            filter.readFields(strm);
            strm.close();
          }
          public void map(Object key, Text value, Context context)
              throws IOException, InterruptedException {
            Map<String, String> parsed = xmlToMap(value.toString());
            String comment = parsed.get("Text");
            StringTokenizer tokenizer = new StringTokenizer(comment);
            // Emit the whole record if any word hits the Bloom filter
            while (tokenizer.hasMoreTokens()) {
              String word = tokenizer.nextToken();
              if (filter.membershipTest(new Key(word.getBytes()))) {
                context.write(value, NullWritable.get());
                break;
              }
            }
          }
        }

    44. Top Ten • Retrieve a relatively small number of top K records based on a ranking scheme • Find the outliers or most interesting records

    45. Known Uses • Outlier analysis • Selecting interesting data • Catchy dashboards

    46. Structure

    47. Performance • Use of a single reducer has some limitations on just how big K can be

    48. Example • Top ten StackOverflow users by reputation

    49. public static class TopTenMapper
            extends Mapper<Object, Text, NullWritable, Text> {
          private TreeMap<Integer, Text> repToRecordMap =
              new TreeMap<Integer, Text>();
          public void map(Object key, Text value, Context context) {
            Map<String, String> parsed = xmlToMap(value.toString());
            String userId = parsed.get("Id");
            String reputation = parsed.get("Reputation");
            repToRecordMap.put(Integer.parseInt(reputation), new Text(value));
            // Keep only the ten highest-reputation records seen by this mapper
            if (repToRecordMap.size() > 10) {
              repToRecordMap.remove(repToRecordMap.firstKey());
            }
          }
          protected void cleanup(Context context)
              throws IOException, InterruptedException {
            for (Text t : repToRecordMap.values()) {
              context.write(NullWritable.get(), t);
            }
          }
        }
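
        The captured transcript ends with the mapper. For completeness, a sketch of the matching single reducer, following the same TreeMap approach (not shown on the slides, so treat it as an assumption): it keeps the global top ten and emits them from highest to lowest reputation.

        public static class TopTenReducer
            extends Reducer<NullWritable, Text, NullWritable, Text> {
          private TreeMap<Integer, Text> repToRecordMap =
              new TreeMap<Integer, Text>();
          public void reduce(NullWritable key, Iterable<Text> values,
              Context context) throws IOException, InterruptedException {
            for (Text value : values) {
              Map<String, String> parsed = xmlToMap(value.toString());
              repToRecordMap.put(Integer.parseInt(parsed.get("Reputation")),
                  new Text(value));
              // Evict the lowest reputation once more than ten are held
              if (repToRecordMap.size() > 10) {
                repToRecordMap.remove(repToRecordMap.firstKey());
              }
            }
            // Emit from highest to lowest reputation
            for (Text t : repToRecordMap.descendingMap().values()) {
              context.write(NullWritable.get(), t);
            }
          }
        }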