
MapReduce Programming on Windows

2019/08/31

Counting a bank's credit card defaulters

CSV download link

Default rules: PAY_1~PAY_6: PAY_1 is the repayment status for September 2005; PAY_2 for August 2005; …; PAY_6 for April 2005. The numeric suffixes in BILL_AMT1~BILL_AMT6 and PAY_AMT1~PAY_AMT6 carry the same meaning.
The values of PAY_1~PAY_6 mean: 0 = paid on time; 1 = payment delayed one month; 2 = payment delayed two months; 3 = payment delayed three months; …; 9 = payment delayed nine months or more.
Each month's payment amount PAY_AMT must not fall below the bank's minimum payment for that month; otherwise it counts as a default. If the payment amount PAY_AMT exceeds last month's bill amount BILL_AMT, it counts as paid on time, and the surplus stays on the card for future spending; if the payment amount is less than last month's bill but above the minimum payment, it counts as delayed repayment.

Requirement:

Write a program on the Hadoop platform that counts the bank's defaulting users.

Implementation:

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>Hadoop</groupId>
    <artifactId>BankDefaulter_MapReduce</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.8.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.8.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.0</version>
        </dependency>

        <dependency>
            <groupId>au.com.bytecode</groupId>
            <artifactId>opencsv</artifactId>
            <version>2.4</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>bankfinddefaulter.FindDefaulter</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
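
Because the assembly plugin is bound to the package phase, mvn package emits a self-contained ("fat") jar under target/; with the coordinates above, standard Maven naming gives it the name

target/BankDefaulter_MapReduce-1.0-SNAPSHOT-jar-with-dependencies.jar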

FindDefaulter.java

package bankfinddefaulter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FindDefaulter {

    public static void main(String[] args) throws Exception {
        // Job.getInstance replaces the deprecated Job() constructor
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(FindDefaulter.class);

        // Location of the CSV file in HDFS
        FileInputFormat.addInputPath(job, new Path("hdfs://172.18.74.236:9000/input/UCI_Credit_Card.csv"));
        // Output directory (must not exist before the job runs)
        FileOutputFormat.setOutputPath(job, new Path("hdfs://172.18.74.236:9000/out"));

        // Wire up the custom mapper and reducer
        job.setMapperClass(BankMapper.class);
        job.setReducerClass(BankReducer.class);

        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
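
One practical wrinkle: MapReduce aborts if the output directory already exists, so a second run fails until /out is removed. A minimal sketch (my addition, not from the original post) that clears it in main() before submitting the job, using the standard FileSystem API:

// extra imports needed for this sketch
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

// in main(), before job.waitForCompletion(true):
Path out = new Path("hdfs://172.18.74.236:9000/out");
FileSystem fs = FileSystem.get(out.toUri(), new Configuration());
if (fs.exists(out)) {
    fs.delete(out, true); // recursive, so old part-r-* files go too
}
FileOutputFormat.setOutputPath(job, out);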

BankReducer.java

package bankfinddefaulter;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class BankReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {

        int count = 0;

        // Sum the 1s emitted by the mapper; summing value.get() instead of
        // count++ also keeps the result correct if values are pre-aggregated
        for (IntWritable value : values) {
            count += value.get();
        }

        context.write(key, new IntWritable(count));
    }

}
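
Because the reducer sums its input values rather than counting them, it can also serve as a combiner to shrink shuffle traffic. This is an optional one-line addition to the driver (a suggestion of mine, not part of the original job):

// optional map-side pre-aggregation
job.setCombinerClass(BankReducer.class);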

BankMapper.java

27
package bankfinddefaulter;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import au.com.bytecode.opencsv.CSVParser;

public class BankMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        // The key is the line's byte offset; offset 0 is the CSV header
        // row, which we skip
        if (key.get() > 0) {

            String[] lines = new CSVParser().parseLine(value.toString());

            // Emit the 25th column (index 24), the default flag
            // (1 = defaulted, 0 = paid), with a count of 1
            context.write(new Text(lines[24]), new IntWritable(1));
        }
    }
}
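
To sanity-check the column index without a cluster, the parsing step can be exercised locally. A small sketch (ParseCheck and the sample row are illustrative, made up to match the 25-column layout described above):

import au.com.bytecode.opencsv.CSVParser;

public class ParseCheck {
    public static void main(String[] args) throws Exception {
        // Made-up row with 25 fields; only the last column matters here
        String sample = "1,20000,2,2,1,24,2,2,-1,-1,-2,-2,"
                + "3913,3102,689,0,0,0,0,689,0,0,0,0,1";
        String[] fields = new CSVParser().parseLine(sample);
        System.out.println(fields.length); // 25
        System.out.println(fields[24]);    // "1" -> a defaulter
    }
}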

Method 1

Write the business logic in IDEA, use mvn to package the program into a jar, upload it to the Hadoop platform, and run it there.

In IDEA's built-in Terminal, enter

mvn clean package

This command is driven by the following section of pom.xml:

<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass>bankfinddefaulter.FindDefaulter</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

Running the jar on the Hadoop platform

After packaging the project into a jar and uploading it to the Hadoop platform, run the fat jar produced by the assembly plugin:

hadoop jar BankDefaulter_MapReduce-1.0-SNAPSHOT-jar-with-dependencies.jar

In the output, the count for key 1 is the number of defaulting users: 6636 users defaulted in total.
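
The counts land in the output directory as tab-separated key/count pairs, one per distinct flag value; assuming the standard 30,000-row UCI file, /out/part-r-00000 should read roughly:

0	23364
1	6636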

Method 2

Run locally in IDEA

After setting up a Hadoop development environment on Windows, run FindDefaulter.java directly.

Then browse the HDFS web UI at 172.18.74.236:50070 and download the files under the /out output directory to inspect the result.
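
The same result can also be read straight from the HDFS shell on the cluster:

hdfs dfs -cat /out/part-r-00000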
