I. Problem Statement
Table: Failed
+--------------+---------+
| Column Name | Type |
+--------------+---------+
| fail_date | date |
+--------------+---------+
The primary key for this table is fail_date.
The Failed table contains the days of failed tasks.
Table: Succeeded
+--------------+---------+
| Column Name | Type |
+--------------+---------+
| success_date | date |
+--------------+---------+
The primary key for this table is success_date.
The Succeeded table contains the days of succeeded tasks.
A system runs one task every day. Every task is independent of the previous tasks, and each task can either fail or succeed.

Write an SQL query to generate a report of period_state for each continuous interval of days in the period from 2019-01-01 to 2019-12-31. period_state is 'failed' if the tasks in the interval failed, or 'succeeded' if they succeeded. Intervals of days are reported as start_date and end_date. Order the result by start_date.

The query result format is in the following example:
Failed table:
+-------------------+
| fail_date         |
+-------------------+
| 2018-12-28        |
| 2018-12-29        |
| 2019-01-04        |
| 2019-01-05        |
+-------------------+
Succeeded table:
+-------------------+
| success_date      |
+-------------------+
| 2018-12-30        |
| 2018-12-31        |
| 2019-01-01        |
| 2019-01-02        |
| 2019-01-03        |
| 2019-01-06        |
+-------------------+
Result table:
+--------------+--------------+--------------+
| period_state | start_date   | end_date     |
+--------------+--------------+--------------+
| succeeded    | 2019-01-01   | 2019-01-03   |
| failed       | 2019-01-04   | 2019-01-05   |
| succeeded    | 2019-01-06   | 2019-01-06   |
+--------------+--------------+--------------+
The report ignored the system state in 2018 as we care about the system in the period 2019-01-01 to 2019-12-31. From 2019-01-01 to 2019-01-03 all tasks succeeded and the system state was "succeeded". From 2019-01-04 to 2019-01-05 all tasks failed and system state was "failed". From 2019-01-06 to 2019-01-06 all tasks succeeded and system state was "succeeded".
II. Summary

Two tables, Failed and Succeeded, record the daily failures and successes of a task that a system runs once per day. Return a result table that lists, in chronological order, each continuous period during which the task kept failing or kept succeeding, together with that period's start and end dates.
III. SQL Analysis

0. DDL and test data

```sql
CREATE TABLE IF NOT EXISTS `Failed` (
    `fail_date` DATE,
    PRIMARY KEY (`fail_date`)
) ENGINE=INNODB DEFAULT CHARSET=utf8;

CREATE TABLE IF NOT EXISTS `Succeeded` (
    `success_date` DATE,
    PRIMARY KEY (`success_date`)
) ENGINE=INNODB DEFAULT CHARSET=utf8;

INSERT INTO Succeeded(success_date) VALUES
('2018-12-30'),('2018-12-31'),('2019-01-01'),('2019-01-02'),('2019-01-03'),('2019-01-06');

INSERT INTO Failed(fail_date) VALUES
('2018-12-28'),('2018-12-29'),('2019-01-04'),('2019-01-05'),('2019-01-07');
```
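Note that this INSERT adds a '2019-01-07' failure that is not in the problem's sample data, so for this dataset the correct report gains a fourth row:

+--------------+--------------+--------------+
| period_state | start_date   | end_date     |
+--------------+--------------+--------------+
| succeeded    | 2019-01-01   | 2019-01-03   |
| failed       | 2019-01-04   | 2019-01-05   |
| succeeded    | 2019-01-06   | 2019-01-06   |
| failed       | 2019-01-07   | 2019-01-07   |
+--------------+--------------+--------------+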
1. Group each table's rows by whether the dates are consecutive.

The trick is to walk the rows in date order while carrying two user variables: @date holds the previous row's date, and @rank is the group number, which stays the same when the current date is exactly one day after @date and increments when there is a gap. (The alias `rank` is backtick-quoted because RANK became a reserved word in MySQL 8.0.)
```sql
# Failed table
SELECT
    fail_date,
    IF(
        @date = DATE_SUB(fail_date, INTERVAL 1 DAY),
        @rank := @rank,
        @rank := @rank + 1
    ) AS `rank`,
    @date := fail_date
FROM
    Failed,
    (SELECT @date := NULL, @rank := 0) r
WHERE fail_date > '2018-12-31'
  AND fail_date < '2020-01-01'
ORDER BY fail_date;

# Succeeded table
SELECT
    success_date,
    IF(
        @date = DATE_SUB(success_date, INTERVAL 1 DAY),
        @rank := @rank,
        @rank := @rank + 1
    ) AS `rank`,
    @date := success_date
FROM
    Succeeded,
    (SELECT @date := NULL, @rank := 0) r
WHERE success_date > '2018-12-31'
  AND success_date < '2020-01-01'
ORDER BY success_date;
```
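With the sample data above, the 2019 rows should come out grouped like this; the rank changes exactly where a date gap appears (the trailing `@date := ...` helper column is omitted here):

```
# Failed                     # Succeeded
fail_date    rank            success_date   rank
2019-01-04   1               2019-01-01     1
2019-01-05   1               2019-01-02     1
2019-01-07   2               2019-01-03     1
                             2019-01-06     2
```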
2. Group each derived table from step 1 by rank to take the minimum and maximum date of each group, then UNION ALL the two results and sort by start_date.
```sql
SELECT
    *
FROM
    (SELECT
        'succeeded' AS period_state,
        MIN(success_date) AS start_date,
        MAX(success_date) AS end_date
    FROM
        (SELECT
            success_date,
            IF(
                @date = DATE_SUB(success_date, INTERVAL 1 DAY),
                @rank := @rank,
                @rank := @rank + 1
            ) AS `rank`,
            @date := success_date
        FROM
            Succeeded,
            (SELECT @date := NULL, @rank := 0) r
        WHERE success_date > '2018-12-31'
          AND success_date < '2020-01-01'
        ORDER BY success_date) t1
    GROUP BY t1.`rank`
    UNION ALL
    SELECT
        'failed' AS period_state,
        MIN(fail_date) AS start_date,
        MAX(fail_date) AS end_date
    FROM
        (SELECT
            fail_date,
            IF(
                @date = DATE_SUB(fail_date, INTERVAL 1 DAY),
                @rank := @rank,
                @rank := @rank + 1
            ) AS `rank`,
            @date := fail_date
        FROM
            Failed,
            (SELECT @date := NULL, @rank := 0) r
        WHERE fail_date > '2018-12-31'
          AND fail_date < '2020-01-01'
        ORDER BY fail_date) t2
    GROUP BY t2.`rank`) temp
ORDER BY start_date;
```
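The user-variable trick depends on MySQL evaluating the SELECT list in row order, which is not guaranteed and is deprecated as of MySQL 8.0.13. On MySQL 8.0+ the same report can be written with the standard gaps-and-islands idiom: subtracting ROW_NUMBER() days from each date yields a value that is constant within each consecutive run. A sketch against the same tables and date range (not part of the original post):

```sql
SELECT period_state,
       MIN(task_date) AS start_date,
       MAX(task_date) AS end_date
FROM (
    SELECT period_state,
           task_date,
           -- consecutive dates in the same state share the same anchor date
           DATE_SUB(task_date,
                    INTERVAL ROW_NUMBER() OVER (PARTITION BY period_state
                                                ORDER BY task_date) DAY) AS grp
    FROM (
        SELECT 'failed' AS period_state, fail_date AS task_date FROM Failed
        UNION ALL
        SELECT 'succeeded', success_date FROM Succeeded
    ) all_tasks
    WHERE task_date BETWEEN '2019-01-01' AND '2019-12-31'
) marked
GROUP BY period_state, grp
ORDER BY start_date;
```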
IV. Hive Version

0.1 DDL: omitted.

0.2 Maven dependencies (pom.xml fragment)

```xml
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
    </dependency>
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>4.7.2</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.junit.jupiter</groupId>
        <artifactId>junit-jupiter-api</artifactId>
        <version>RELEASE</version>
        <scope>compile</scope>
    </dependency>
</dependencies>

<dependencyManagement>
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
        </dependency>
    </dependencies>
</dependencyManagement>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
```
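With the assembly plugin bound to the package phase as above, running `mvn clean package` should produce a `*-jar-with-dependencies.jar` fat JAR under `target/`, which is what gets uploaded in step 2 below.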
1. A custom UDTF that replaces step 1 of Section III. It receives dates in sorted order, tracks the current run of consecutive days, and forwards one (start_date, end_date) row whenever the run breaks, plus the final open run in close().
```java
package com.hive.udf;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class TestGenericUDTF extends GenericUDTF {
    // Start of the current run of consecutive dates.
    private Date startDate = null;
    // Most recently processed date.
    private Date lastDate = null;
    // Start of the last interval already forwarded. Compared by reference on
    // purpose: it is only ever assigned from startDate itself.
    private Date lastEmitDate = null;
    private static SimpleDateFormat s = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    public void close() throws HiveException {
        // Input is exhausted: flush the interval that is still open.
        if (startDate != null && lastDate != null && lastEmitDate != startDate) {
            String[] result = new String[]{s.format(startDate), s.format(lastDate)};
            forward(result);
        }
    }

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("TestGenericUDTF takes only one argument");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("TestGenericUDTF takes string as a parameter");
        }

        // Output schema: one row per interval, (start_date, end_date) as strings.
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("start_date");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("end_date");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        String dateStr = args[0].toString();
        Date date = null;
        try {
            date = s.parse(dateStr);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // An unparseable value closes the current interval and resets all state.
        if (date == null) {
            if (startDate != null && lastDate != null && lastEmitDate != startDate) {
                String[] result = new String[]{s.format(startDate), s.format(lastDate)};
                forward(result);
                lastEmitDate = startDate;
            }
            startDate = null;
            lastDate = null;
            return;
        }
        if (startDate == null) {
            // First date of a new interval.
            startDate = date;
        } else if (lastDate != null) {
            // If the incoming date is not exactly lastDate + 1 day, the run is
            // broken: emit the finished interval and start a new one.
            GregorianCalendar calendar = new GregorianCalendar();
            calendar.setTime(lastDate);
            calendar.add(Calendar.DATE, 1);
            Date next = calendar.getTime();
            if (!next.equals(date)) {
                String[] result = new String[]{s.format(startDate), s.format(lastDate)};
                forward(result);
                lastEmitDate = startDate;
                startDate = date;
            }
        }
        lastDate = date;
    }
}
```
2. Build the fat JAR, upload it to the server, and register the UDTF:
```sql
ADD JAR /home/hadoop/hive-dateudtf-jar-with-dependencies.jar;
CREATE TEMPORARY FUNCTION dateUDTF AS 'com.hive.udf.TestGenericUDTF';
```
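The post stops at registering the function; the following is a minimal sketch (my addition, not the author's) of one way to invoke it for the Failed table, using the DDL column names above. Because the UDTF keeps state across rows, it assumes a single instance sees all dates in sorted order; the ORDER BY subquery pushes everything through one reducer, but on large inputs this property should be verified:

```sql
-- Sketch: one (start_date, end_date) row per consecutive run of failed days in 2019.
SELECT dateUDTF(fail_date) AS (start_date, end_date)
FROM (
    SELECT fail_date
    FROM Failed
    WHERE fail_date >= '2019-01-01' AND fail_date <= '2019-12-31'
    ORDER BY fail_date
) f;
```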