In the second part of this series, we will refine the word count program written earlier. For all the work in this post, we will be using the Cloudera sandbox, Cloudera QuickStart VM 5.12.
We notice from the results of the word count in the last exercise that the output contains punctuation marks, and this leads to incorrect classification of the words. For example, you can see from the results below that the word I is classified as "I, I and I; (three distinct tokens because of the attached punctuation), each with a different count value:
Ideally, we should have only a single I, since they are all the same word. So, in this post, we will strip the input of all punctuation characters and then attempt the word count. Only the mapper program will be refined; the reducer and driver programs remain the same.
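The cleanup rests on a single regular-expression substitution, replaceAll("[^A-Za-z0-9]", " "), which turns every character that is not a letter or a digit into a space. To see its effect in isolation, here is a minimal standalone sketch (the sample string is just an illustration, not part of the job):

public class ReplaceAllDemo {
    public static void main(String[] args) {
        String line = "\"I, I and I;";
        // Every character that is not a letter or a digit becomes a space,
        // so all three tokens normalize to the same bare word: I
        System.out.println(line.replaceAll("[^A-Za-z0-9]", " "));
        // prints: " I  I and I " (the quote, comma and semicolon become spaces)
    }
}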
The Mapper program is shown below:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Replace every character that is not a letter or a digit with a
        // space before tokenizing, so punctuation no longer sticks to words
        StringTokenizer tokenizer = new StringTokenizer(line.replaceAll("[^A-Za-z0-9]", " "));
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            // Emit (word, 1) for every token
            context.write(word, ONE);
        }
    }
}
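Two small points about the mapper are worth noting. The Text and IntWritable output objects are reused across calls rather than allocated for every token, which is the usual Hadoop idiom since map may run millions of times. Also, because the regular expression replaces every non-alphanumeric character, apostrophes are stripped too, so a word like don't is counted as the two tokens don and t; a more selective character class could be used if that matters for your data.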
The Reducer program is shown below:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        // Sum the 1s emitted by the mapper for this word
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
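Since the reduce step is just an associative and commutative sum, the same class could in principle also be registered as a combiner with job.setCombinerClass(SecondWordCountReducer.class) to pre-aggregate counts on the map side and cut shuffle traffic. We do not use that here, but it is a common refinement for word count.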
The Driver program is shown below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecondWordCountDriver {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Enter 'input path' and 'output path' arguments");
            System.exit(-1);  // non-zero status to signal the usage error
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Second Word Count");
        job.setJarByClass(SecondWordCountDriver.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(SecondWordCountMapper.class);
        job.setReducerClass(SecondWordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
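To run the job on the QuickStart VM, compile the three classes against the Hadoop classpath, package them into a jar (the jar name here is just an example), and submit it with the hadoop jar command: hadoop jar SecondWordCount.jar SecondWordCountDriver <input path> <output path>. Note that the output directory must not already exist in HDFS; FileOutputFormat refuses to overwrite an existing path and the job will fail if it does.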
The output is shown below:
Note that all the punctuation characters are gone, and the word I now has a single count of 10, in line with the aim of this post.