1.定位连续值的范围
-- Sample data: 14 projects, each with a start and an end date.
-- Some projects begin on the day the previous one ends (consecutive),
-- others are separated by a gap.
create or replace view v (proj_id, proj_start, proj_end) as
    select  1, date '2005-01-01', date '2005-01-02' from dual
    union all
    select  2, date '2005-01-02', date '2005-01-03' from dual
    union all
    select  3, date '2005-01-03', date '2005-01-04' from dual
    union all
    select  4, date '2005-01-04', date '2005-01-05' from dual
    union all
    select  5, date '2005-01-06', date '2005-01-07' from dual
    union all
    select  6, date '2005-01-16', date '2005-01-17' from dual
    union all
    select  7, date '2005-01-17', date '2005-01-18' from dual
    union all
    select  8, date '2005-01-18', date '2005-01-19' from dual
    union all
    select  9, date '2005-01-19', date '2005-01-20' from dual
    union all
    select 10, date '2005-01-21', date '2005-01-22' from dual
    union all
    select 11, date '2005-01-26', date '2005-01-27' from dual
    union all
    select 12, date '2005-01-27', date '2005-01-28' from dual
    union all
    select 13, date '2005-01-28', date '2005-01-29' from dual
    union all
    select 14, date '2005-01-29', date '2005-01-30' from dual;
需求:把首尾相接(一个工程的结束日期等于另一个工程的开始日期)的连续工程查询出来
方案1:自关联
-- Approach 1: self-join.
-- Returns every project (v1) whose end date equals the start date of some
-- other project (v2), i.e. projects that are directly followed by another.
-- The join is written as "v2.proj_start = v1.proj_end" so that it selects
-- the same projects as the lead() version (end date = next start date);
-- the reverse direction would instead return the *following* projects.
select v1.proj_id    as 工程号,
       v1.proj_start as 开始时间,
       v1.proj_end   as 结束时间
from v v1
inner join v v2
    on v2.proj_start = v1.proj_end;
方案2:使用lead() over() 进行过滤
-- Approach 2: a single pass over v using lead().
-- For each project, fetch the start date of the next project (by proj_id),
-- then keep only the rows whose end date equals that next start date.
with proj_with_next as (
    select proj_id    as 工程号,
           proj_start as 开始时间,
           proj_end   as 结束时间,
           lead(proj_start) over (order by proj_id) as 下一期工程开始时间
    from v
)
select *
from proj_with_next
where 结束时间 = 下一期工程开始时间;
在上面的两种写法中,自关联需要扫描两次视图“V”,而使用分析函数只需要一次就可以,根据这个特性,大部分情况下可以通过分析函数优化查询性能。
2.定位连续值范围的开始点和结束点
需求:现在要求把连续的项目合并,返回合并后的起止时间,如前四个项目合并后起止时间就是1号到5号。
如果是取最小开始时间和最大结束时间,则比较容易操作
-- Earliest start and latest end across ALL projects (ignores gaps).
select
    min(proj_start) as 开始,
    max(proj_end)   as 结束
from v;
但是远远不能满足我们的需求。
分析:
(1)提取上一工程的结束日期
-- Step 1: for every project, fetch the end date of the previous project
-- (ordered by proj_id). The first row has no predecessor, so lag() yields NULL.
create or replace view x0 as
select proj_id    as 编号,
       proj_start as 开始日期,
       proj_end   as 结束日期,
       lag(proj_end) over (order by proj_id) as 上一工程结束日期
from v;

-- Inspect the intermediate result.
select * from x0;
(2)标定工程的连续状态
-- Step 2: flag each row. 0 = the project starts exactly when the previous
-- one ended (continues the current run); 1 = it starts a new run.
-- A NULL previous end date (first row) never compares equal, so it yields 1.
create or replace view x1 as
    select
        编号,
        开始日期,
        结束日期,
        上一工程结束日期,
        case 开始日期
            when 上一工程结束日期 then 0
            else 1
        end as 连续状态
    from x0;

-- Inspect the intermediate result.
select * from x1;
可以看到,在每一个连续分组的开始位置,我们都生成了一个“1”作为标识。
(3)对这个位置状态进行累加,得到分组依据
-- Step 3: running total of the "new run" flag. Every row of the same
-- consecutive run ends up with the same running total, which therefore
-- serves as the grouping key.
create or replace view x2 as
    select
        编号,
        开始日期,
        结束日期,
        上一工程结束日期,
        连续状态,
        -- The frame below is the default for an ordered window; spelled out for clarity.
        sum(连续状态) over (
            order by 编号
            range between unbounded preceding and current row
        ) as 分组依据
    from x1;

-- Inspect the intermediate result.
select * from x2;
可以看到,通过提取数据(上一行日期)、生成标识、累加标识这些操作后,得到了5个连续分组,有分组依据后就容易完成下面的操作。
-- Collapse each consecutive run into a single row holding its
-- earliest start date and latest end date.
select
    分组依据,
    min(开始日期) as 开始日期,
    max(结束日期) as 结束日期
from x2
group by 分组依据
order by 分组依据;
把上面各步骤整理在一起的语句如下:
-- All steps combined into one statement.
with flagged as (
    -- Flag the first project of every consecutive run with 1, the rest with 0.
    select proj_id    as 编号,
           proj_start as 开始日期,
           proj_end   as 结束日期,
           case
               when lag(proj_end) over (order by proj_id) = proj_start then 0
               else 1
           end as 连续状态
    from v
),
grouped as (
    -- The running total of the flags gives one key per consecutive run.
    select 编号,
           开始日期,
           结束日期,
           sum(连续状态) over (order by 编号) as 分组依据
    from flagged
)
-- One row per run: earliest start and latest end.
select 分组依据,
       min(开始日期) as 开始日期,
       max(结束日期) as 结束日期
from grouped
group by 分组依据
order by 分组依据;
3.合并时间段
-- Sample data: tasks whose date ranges may overlap, touch, or be fully
-- contained in an earlier range.
-- NOTE(review): the column name tast_id looks like a typo for task_id;
-- it is kept unchanged here so existing references keep working.
create or replace view Timesheets(tast_id,start_date,end_date) as
select 1,date'1997-01-01',date'1997-01-03' from dual union all
select 2,date'1997-01-02',date'1997-01-04' from dual union all
select 3,date'1997-01-04',date'1997-01-05' from dual union all
select 4,date'1997-01-06',date'1997-01-09' from dual union all
select 5,date'1997-01-09',date'1997-01-09' from dual union all
select 6,date'1997-01-09',date'1997-01-09' from dual union all
select 7,date'1997-01-12',date'1997-01-15' from dual union all
select 8,date'1997-01-13',date'1997-01-13' from dual union all
select 9,date'1997-01-15',date'1997-01-15' from dual union all
select 10,date'1997-01-17',date'1997-01-17' from dual;

-- Inspect the sample data.
select * from Timesheets;
id7与id9是连续的(id7在01-15结束,id9在01-15开始),但中间的id8与id9不连续(id8在01-13结束),所以只用lag取上一行来判断肯定不对。
(1)这时可以用另一个开窗方式来处理:获取当前行之前的最大“end_date”
-- For each row (ordered by start_date), compute the largest end_date among
-- all rows that come before it. The frame stops at "1 preceding", so the
-- current row itself is excluded; the first row therefore gets NULL.
select
    start_date,
    end_date,
    max(end_date) over (
        order by start_date
        rows between unbounded preceding and 1 preceding
    ) as max_end_date
from timesheets;
rows between unbounded preceding and 1 preceding:这是开窗范围的 between ... and ... 子句,意思是:从第一行到当前行的上一行(不包含当前行)
该分析函数就是order by start_date后“第一行到上一行”范围内的“max(end_date)”
有了这个数据后再来判断,就可以把id(7、8、9)判断为连续范围了。