A simple WordCount example with Hadoop

骑猪看日落 · 2022-06-18

Prerequisites:
Download a Hadoop release archive from the official mirrors; this article uses version 2.7.3. Extract it to a directory on your machine, here: D:\dev\hadoop-2.7.3
Download link: http://apache.fayea.com/hadoop/common/hadoop-2.7.3/
(screenshot of the download page)
The package ending in -src contains the source code; download it as well if you want to study the internals.
Configure the environment variable:
HADOOP_HOME = D:\dev\hadoop-2.7.3

1. Create a new Maven project in IDEA.
2. Edit the project's pom.xml and add the following dependencies; a sketch of where they sit in the pom follows the list.

    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>2.7.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>2.7.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.3</version>
    </dependency>

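All of these <dependency> entries belong inside the <dependencies> element of the pom.xml. As a rough sketch (the groupId, artifactId and version of the project itself are placeholders; replace them with your own):

    <project xmlns="http://maven.apache.org/POM/4.0.0">
        <modelVersion>4.0.0</modelVersion>
        <!-- placeholder coordinates for your own project -->
        <groupId>com.hadoop</groupId>
        <artifactId>wordcount-demo</artifactId>
        <version>1.0-SNAPSHOT</version>
        <dependencies>
            <!-- the dependency entries listed above go here -->
        </dependencies>
    </project>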
3. Under the java source folder (src/main/java), create a package com.hadoop.wordcount (any name works).
Inside the package, create a class named WordCount.
Its content is as follows:

    package com.hadoop.wordcount;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;
    import java.util.StringTokenizer;

    /**
     * WordCount
     *
     * @author wychen
     * @time 2017/3/20 20:25
     */
    public class WordCount {

        static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                // split the line into tokens
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    // skip tokens with fewer than 5 characters
                    String tmp = itr.nextToken();
                    if (tmp.length() < 5) {
                        continue;
                    }
                    word.set(tmp);
                    context.write(word, one);
                }
            }
        }

        static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            private IntWritable result = new IntWritable();
            private Text keyEx = new Text();

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    // add 1 to every incoming 1, which effectively doubles each word's count
                    sum += val.get() + 1;
                }
                result.set(sum);
                // prefix the output key with "输出:" ("output:")
                keyEx.set("输出:" + key.toString());
                context.write(keyEx, result);
            }
        }

        public static void main(String[] args) throws Exception {
            // job configuration
            Configuration conf = new Configuration();
            // create the job and give it a name
            Job job = Job.getInstance(conf, "mywordcount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // input and output paths come from the program arguments
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // wait for the job to finish and exit with its status
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
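To make the mapper/reducer behaviour concrete, consider a hypothetical input file containing the single line `hadoop map hadoop world`. The mapper splits the line, drops `map` because it has fewer than 5 characters, and emits (hadoop, 1) twice and (world, 1) once. The reducer adds 1 on top of every incoming 1, so each word's count is doubled, and prefixes the key with "输出:". With the default TextOutputFormat (key and value separated by a tab), the output would be:

    输出:hadoop	4
    输出:world	2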

4. Create a logging configuration file named log4j.properties under src/main/resources:

    log4j.rootLogger=DEBUG, stdout
    log4j.appender.stdout=org.apache.log4j.ConsoleAppender
    log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
    log4j.appender.stdout.layout.ConversionPattern=%c{1} - %m%n
    log4j.logger.java.sql.PreparedStatement=DEBUG
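For reference: %c{1} prints only the last component of the logger name, %m the log message, and %n a newline. The root logger is set to DEBUG, which makes a local MapReduce run quite chatty; raising it to INFO is an easy way to reduce the console noise if you only care about job progress.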

Now you can run the main method of the WordCount class directly.
First, configure the run parameters (in IDEA: Run → Edit Configurations → Program arguments), for example as shown below.
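The two program arguments are the input path and the output path that main() hands to FileInputFormat and FileOutputFormat. A hypothetical example (the paths are placeholders):

    D:\data\wordcount\input.txt D:\data\wordcount\output

Note that the output directory must not already exist, otherwise FileOutputFormat refuses to start the job; after a successful run the counts end up in a file named part-r-00000 inside that directory.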
Then right-click inside the WordCount class and choose Run. You can follow the job's progress in the console, and the results are written to the output path you supplied.
The author's output looked like this:
(screenshot of the output)

That's it: a simple MapReduce word-count example is complete.
