大数据 MapReduce基础实战

🕗 发布于 2024-11-29 16:40 大数据 mapreduce

一、关于此次实践

1、实战简介

MapReduce是Hadoop的核心功能之一，掌握它对学习Hadoop至关重要。Hadoop Map/Reduce是一个使用简易的软件框架，基于它写出来的应用程序能够运行在由上千台商用机器组成的大型集群上，并以一种可靠、容错的方式并行处理TB级别以上的数据集。

本章我们通过几个示例来学习MapReduce的用法。

2、全部任务

二、实践详解

1、第 1 关：成绩统计

命令行
# Create the first local sample file.
touch file01
# This echo has no redirection, so it only prints to the terminal; file01 is not modified by it.
echo Hello World Bye World
cat file01
# Overwrite file01 with the sample sentence, then display it.
echo Hello World Bye World >file01
cat file01
# Create and fill the second local sample file, then display it.
touch file02
echo Hello Hadoop Goodbye Hadoop >file02
cat file02
# Hadoop's start-dfs.sh script (brings up the HDFS daemons).
start-dfs.sh
# Create the HDFS directories /usr and /usr/input.
hadoop fs -mkdir /usr
hadoop fs -mkdir /usr/input
# NOTE(review): /usr/output is not created by any command in this transcript, so this
# listing presumably reports "No such file or directory" — confirm whether it was intended.
hadoop fs -ls /usr/output
hadoop fs -ls /
hadoop fs -ls /usr
# Upload both sample files into /usr/input and verify the upload.
# NOTE(review): the Java jobs in this article read from /user/test/input, /user/tmp/input/
# and /user/reduce/input, not from /usr/input — verify which path the exercise expects.
hadoop fs -put file01 /usr/input
hadoop fs -put file02 /usr/input
hadoop fs -ls /usr/input

import java.io.IOException;
import java.util.StringTokenizer;
 
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    /********** Begin **********/
//Mapper函数
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private int maxValue = 0;
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString(),"\n");
            while (itr.hasMoreTokens()) {
                String[] str = itr.nextToken().split(" ");
                String name = str[0];
                one.set(Integer.parseInt(str[1]));
                word.set(name);
                context.write(word,one);
            }
            //context.write(word,one);
        }
    }
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int maxAge = 0;
            int age = 0;
            for (IntWritable intWritable : values) {
                maxAge = Math.max(maxAge, intWritable.get());
            }
            result.set(maxAge);
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        String inputfile = "/user/test/input";
        String outputFile = "/user/test/output/";
        FileInputFormat.addInputPath(job, new Path(inputfile));
        FileOutputFormat.setOutputPath(job, new Path(outputFile));
        job.waitForCompletion(true);
    /********** End **********/
    }
}

2、第 2 关：文件内容合并去重

import java.io.IOException;

import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Merge {

/**
 * @param args
 * 对A,B两个文件进行合并，并剔除其中重复的内容，得到一个新的输出文件C
 */
//在这重载map函数，直接将输入中的value复制到输出数据的key上 注意在map方法中要抛出异常：throws IOException,InterruptedException
public static class Map  extends Mapper<Object, Text, Text, Text>{

    /********** Begin **********/

        public void map(Object key, Text value, Context content) 
            throws IOException, InterruptedException {  
            Text text1 = new Text();
            Text text2 = new Text();
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                text1.set(itr.nextToken());
                text2.set(itr.nextToken());
                content.write(text1, text2);
            }
        }  
/********** End **********/
} 

//在这重载reduce函数，直接将输入中的key复制到输出数据的key上  注意在reduce方法上要抛出异常：throws IOException,InterruptedException
public static class  Reduce extends Reducer<Text, Text, Text, Text> {
    /********** Begin **********/
        
        public void reduce(Text key, Iterable<Text> values, Context context) 
            throws IOException, InterruptedException {
            Set<String> set = new TreeSet<String>();
            for(Text tex : values){
                set.add(tex.toString());
            }
            for(String tex : set){
                context.write(key, new Text(tex));
            }
        }  
    
/********** End **********/

}

public static void main(String[] args) throws Exception{

// TODO Auto-generated method stub
Configuration conf = new Configuration();
conf.set("fs.default.name","hdfs://localhost:9000");

Job job = Job.getInstance(conf,"Merge and duplicate removal");
job.setJarByClass(Merge.class);
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
String inputPath = "/user/tmp/input/";  //在这里设置输入路径
String outputPath = "/user/tmp/output/";  //在这里设置输出路径

FileInputFormat.addInputPath(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}

}

3、第 3 关：信息挖掘 - 挖掘父子关系

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class simple_data_mining {
public static int time = 0;

/**
 * @param args
 * 输入一个child-parent的表格
 * 输出一个体现grandchild-grandparent关系的表格
 */
//Map将输入文件按照空格分割成child和parent，然后正序输出一次作为右表，反序输出一次作为左表，需要注意的是在输出的value中必须加上左右表区别标志
public static class Map extends Mapper<Object, Text, Text, Text>{
public void map(Object key, Text value, Context context) throws IOException,InterruptedException{
/********** Begin **********/
String line = value.toString();
             String[] childAndParent = line.split(" ");
             List<String> list = new ArrayList<>(2);
              for (String childOrParent : childAndParent) {
                 if (!"".equals(childOrParent)) {
                     list.add(childOrParent);
                  } 
              } 
              if (!"child".equals(list.get(0))) {
                  String childName = list.get(0);
                  String parentName = list.get(1);
                  String relationType = "1";
                  context.write(new Text(parentName), new Text(relationType + "+"
                        + childName + "+" + parentName));
                  relationType = "2";
                  context.write(new Text(childName), new Text(relationType + "+"
                        + childName + "+" + parentName));
              }
/********** End **********/
}
}

public static class Reduce extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException,InterruptedException{
/********** Begin **********/

    //输出表头
          if (time == 0) {
                context.write(new Text("grand_child"), new Text("grand_parent"));
                time++;
            }

//获取value-list中value的child
List<String> grandChild = new ArrayList<>();

//获取value-list中value的parent
 List<String> grandParent = new ArrayList<>();

//左表，取出child放入grand_child
 for (Text text : values) {
                String s = text.toString();
                String[] relation = s.split("\\+");
                String relationType = relation[0];
                String childName = relation[1];
                String parentName = relation[2];
                if ("1".equals(relationType)) {
                    grandChild.add(childName);
                } else {
                    grandParent.add(parentName);
                }
            }

//右表，取出parent放入grand_parent
 int grandParentNum = grandParent.size();
               int grandChildNum = grandChild.size();
               if (grandParentNum != 0 && grandChildNum != 0) {
                for (int m = 0; m < grandChildNum; m++) {
                    for (int n = 0; n < grandParentNum; n++) {
                        //输出结果
                    context.write(new Text(grandChild.get(m)), new Text(
                                grandParent.get(n)));
                    }
                }
            }
/********** End **********/
}
}
public static void main(String[] args) throws Exception{
// TODO Auto-generated method stub
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"Single table join");
job.setJarByClass(simple_data_mining.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
String inputPath = "/user/reduce/input";   //设置输入路径
String outputPath = "/user/reduce/output";   //设置输出路径
FileInputFormat.addInputPath(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
System.exit(job.waitForCompletion(true) ? 0 : 1);

}
}

原文地址：https://blog.csdn.net/m0_74344909/article/details/144107281

免责声明：本站文章内容转载自网络资源，如本站内容侵犯了原著者的合法权益，可联系本站删除。更多内容请关注自学内容网（zxcms.com）！

上一篇：WPF的表格控件 FlexGrid设置行的高度自适应
下一篇：Spring cache注解：缓存与业务解耦实战

Transformer？Attention？——Are All You Need！
Hi，大家好，我是半亩花海。本文主要较为深入地讲述transformer 模型及attention 机制等相关深度学习的知识，主要介绍模型结构、原理等。Transformer 属于是当下比较流行和创
阅读更多2024-11-29
Sourcetree：一款强大的Git客户端
Sourcetree是一款强大的Git客户端，它提供了一个直观的图形界面，让用户能够轻松地管理他们的版本控制系统。无论是初学者还是有经验的开发者，Sourcetree都能提供方便快捷的Git操作方式，
阅读更多2024-11-29
力扣第 71 题简化路径
这道题目考察了字符串操作和栈的基本应用。在实现中，strtok和数组栈的结合使代码简单易懂。如果你对 C++ 或其他语言感兴趣，也可以尝试用 STL 或其他高级工具实现！如果你有任何问题，欢迎在评论区
阅读更多2024-11-29
数字IC后端设计实现之分段长clock tree经典案例
数字IC后端设计实现之分段长clock tree经典案例分享
阅读更多2024-11-29
网络安全视角下医疗网络渗透分析与防范策略
随着医疗信息化的快速发展，医疗网络安全问题日益凸显。本文从网络安全视角出发，深入分析了医疗内网Web渗透的常见手段，包括SQL注入攻击、XSS攻击、CSRF攻击、文件上传漏洞和DDoS攻击等，并结合实
阅读更多2024-11-29
变换矩阵左乘与右乘的区别和应用
变换矩阵左乘和右乘应用与实际意义
阅读更多2024-11-29
【微服务】消息队列与微服务之微服务详解
Spring Cloud并没有重复制造轮子，它只是将目前各家公司开发的比较成熟、经得起实际考验的服务框架组合起来，通过Spring Boot风格进行再封装屏蔽掉了复杂的配置和实现原理，最终给开发者留出
阅读更多2024-11-29
aws出现创建ec2连接报错（网络问题）
二.检查公网路由表内是否连接网关，网关是否是同一个VPC。一.检查自己ec2使用的子网是否为公网。
阅读更多2024-11-29
用go语言写一个小服务
使用go语言编写一个小服务
阅读更多2024-11-29
Linux进程基础
在用户的视角上，这些进程看似同时进行，但不是真正意义上的“同时进行”，因为只有一个处理器，在固定的时刻只有固定的进程被执行了，只不过CPU处理的速度很快，它把一个时间段划分成很短的时间片段，然后在这之
阅读更多2024-11-29