java hadoop wordcount

素颜马尾好姑娘i · 2022-05-17 11:16

What is it?

  1. The classic Hadoop starter example: run a word-count job.

Why?

  1. Once you understand one MapReduce job, everything after it is easy. The hardest part is getting started; once you are on the road, the rest is simple.

How?

Step 1: Start from a demo found online: https://www.programcreek.com/java-api-examples/?code=Nextzero/hadoop-2.6.0-cdh5.4.3/hadoop-2.6.0-cdh5.4.3-master/hadoop-mapreduce1-project/src/test/org/apache/hadoop/mapreduce/TestMapReduceLocal.java#

The code is as follows:

    package com.geotmt.hadoop.hdfs;

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /**
     * Word count.
     *
     * Created by chao,zhao on 2018/8/9.
     */
    public class FirstMapReduceJob {

        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            // Emit (word, 1) for every whitespace-separated token in the input line.
            public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

            private IntWritable result = new IntWritable();

            // Sum all counts for a word; also reused as the combiner below.
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }
            Job job = Job.getInstance(conf, "word count"); // new Job(conf, ...) is deprecated
            job.setJarByClass(FirstMapReduceJob.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
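A note on the job setup: IntSumReducer is registered as both the combiner and the reducer. This works because addition is associative and commutative, so pre-summing partial counts on the map side changes nothing in the final totals while cutting shuffle traffic. To get a feel for what the job computes without a cluster, here is a minimal plain-Java sketch of the same tokenize-then-sum logic; the class name WordCountSketch and the input string are made up for illustration:

    import java.util.Map;
    import java.util.StringTokenizer;
    import java.util.TreeMap;

    // Plain-Java sketch of the word-count logic above; no Hadoop involved.
    public class WordCountSketch {
        public static void main(String[] args) {
            String input = "hello hadoop hello world"; // made-up sample input
            // TreeMap keeps keys sorted, mirroring the sorted reducer output
            Map<String, Integer> counts = new TreeMap<String, Integer>();

            // "map" side: tokenize on whitespace; the map update stands in for shuffle + reduce-side summing
            StringTokenizer itr = new StringTokenizer(input);
            while (itr.hasMoreTokens()) {
                String token = itr.nextToken();
                Integer prev = counts.get(token);
                counts.put(token, prev == null ? 1 : prev + 1);
            }

            // prints "hadoop 1", "hello 2", "world 1", tab-separated like part-r-00000
            for (Map.Entry<String, Integer> e : counts.entrySet()) {
                System.out.println(e.getKey() + "\t" + e.getValue());
            }
        }
    }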

Step 2: Package the jar with all dependencies bundled in. An alternative is to put the dependency jars in a directory on the cluster and configure the classpath via environment variables, but in my view that approach is not worth it; it causes no end of problems.

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>myHadoop</groupId>
        <artifactId>myHadoop</artifactId>
        <version>1.0-SNAPSHOT</version>

        <properties>
            <hadoop.hdfs.version>2.4.0</hadoop.hdfs.version>
            <hadoop.mapreduce.client.core.version>2.4.0</hadoop.mapreduce.client.core.version>
            <hadoop.common.version>2.4.0</hadoop.common.version>
            <hadoop.mapreduce.client.common.version>2.4.0</hadoop.mapreduce.client.common.version>
            <hadoop.mapreduce.client.jobclient.version>2.4.0</hadoop.mapreduce.client.jobclient.version>
            <commons.lang.version>3.3.2</commons.lang.version>
            <commons.io.version>2.4</commons.io.version>
        </properties>

        <dependencies>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hadoop.hdfs.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-core</artifactId>
                <version>${hadoop.mapreduce.client.core.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hadoop.common.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-common</artifactId>
                <version>${hadoop.mapreduce.client.common.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
                <version>${hadoop.mapreduce.client.jobclient.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>${commons.lang.version}</version>
            </dependency>
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>${commons.io.version}</version>
            </dependency>
        </dependencies>

        <build>
            <plugins>
                <plugin>
                    <artifactId>maven-assembly-plugin</artifactId>
                    <configuration>
                        <appendAssemblyId>false</appendAssemblyId>
                        <descriptorRefs>
                            <descriptorRef>jar-with-dependencies</descriptorRef>
                        </descriptorRefs>
                        <archive>
                            <manifest>
                                <!-- the class with the main method, used as the jar entry point -->
                                <mainClass>com.geotmt.hadoop.hdfs.FirstMapReduceJob</mainClass>
                            </manifest>
                        </archive>
                    </configuration>
                    <executions>
                        <execution>
                            <id>make-assembly</id>
                            <phase>package</phase>
                            <goals>
                                <!-- 'single' is the supported goal; the old 'assembly' goal is deprecated -->
                                <goal>single</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.7</source>
                        <target>1.7</target>
                    </configuration>
                </plugin>
            </plugins>
        </build>
    </project>
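With this pom in place, build the runnable jar with a plain Maven package run. Because appendAssemblyId is false, the assembled jar keeps the simple name used in the next step:

    mvn clean package
    # produces target/myHadoop-1.0-SNAPSHOT.jar with all dependencies inside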

Step 3: Upload the jar to the cluster (make sure the user and file permissions are consistent) and run the job:

    hadoop jar myHadoop-1.0-SNAPSHOT.jar /user/zhaochao/20180727/sms.1532681090912.log.tmp /user/zhaochao/20180727/mr_result
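Note that the second argument is an output directory, not a file: the job creates it and writes its results into part files inside it. FileOutputFormat fails the job if the directory already exists, so on a rerun remove it first:

    hadoop fs -rm -r /user/zhaochao/20180727/mr_result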

Step 4: Check the result:

    hadoop fs -cat /user/zhaochao/20180727/mr_result/part-r-00000
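Each line of the part file is a word and its count, separated by a tab. The actual contents depend on the input log; for the made-up sample input in the sketch above, the output would be:

    hadoop	1
    hello	2
    world	1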
