Skip navigation.

Randolf Geist

Syndicate content
Updated: 8 hours 30 min ago

Parallel Execution Skew – Demonstrating Skew

Mon, 2014-04-14 12:42
This is just a short notice that the next part of the mini-series "Parallel Execution Skew" is published at AllThingsOracle.com

Analysing Parallel Execution Skew - Without Diagnostics / Tuning Pack License

Sun, 2014-04-06 14:27
This is the third part of the video tutorial "Analysing Parallel Execution Skew". In this part I show how to analyse a parallel SQL execution regarding Parallel Execution Skew.

If you don't have a Diagnostics / Tuning Pack license the options you have for doing that are quite limited, and the approach, as demonstrated in the tutorial, has several limitations and shortcomings.

Here is the video:



If you want to reproduce or play around with the examples shown in the tutorial here is the script for creating the tables and running the queries / DML commands used in the tutorial. A shout goes out to Christo Kutrovsky at Pythian who I think was the one who inspired the beautified version of V$PQ_TQSTAT.

---------------------
-- Links for S-ASH --
---------------------
--
-- http://www.perfvision.com/ash.php
-- http://www.pythian.com/blog/trying-out-s-ash/
-- http://sourceforge.net/projects/orasash/files/v2.3/
-- http://sourceforge.net/projects/ashv/
---------------------

-- Table creation
set echo on timing on time on

-- Clean-up so the script can be re-run from scratch: drop each demo
-- table and purge it from the recycle bin. "ORA-00942: table or view
-- does not exist" errors on a first run are expected and harmless.
drop table t_1;

purge table t_1;

drop table t_2;

purge table t_2;

drop table t_1_part;

purge table t_1_part;

drop table t_2_part;

purge table t_2_part;

drop table t1;

purge table t1;

drop table t2;

purge table t2;

drop table t3;

purge table t3;

drop table t4;

purge table t4;

drop table t5;

purge table t5;

drop table x;

purge table x;

-- t1: small table - 2,000 rows (1,000 x 2 row generator), even ids,
-- ~100 byte filler. The cardinality hints only size the CONNECT BY
-- row generators, they do not affect the created data.
create table t1
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't1')

alter table t1 cache;

-- t2: large table - 2,000,000 rows (1,000,000 x 2), odd ids,
-- id2 cycles through 1..2000, compressed to keep it small on disk.
create table t2
compress
as
select
(rownum * 2) + 1 as id
, mod(rownum, 2000) + 1 as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000000) */ * from dual
connect by
level <= 1000000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't2')

alter table t2 cache;

-- t3: same layout and content as t1 (second copy for multi-DFO demos).
create table t3
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't3')

alter table t3 cache;

-- t4: same layout and content as t2 (second copy for multi-DFO demos).
create table t4
compress
as
select
(rownum * 2) + 1 as id
, mod(rownum, 2000) + 1 as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000000) */ * from dual
connect by
level <= 1000000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't4')

alter table t4 cache;

-- t5: third copy of the small-table layout.
create table t5
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't5')

alter table t5 cache;

-- x: empty copy of t2's structure, target for the parallel DML demos;
-- the unique index forces index maintenance during the inserts.
create table x
compress
as
select * from t2
where 1 = 2;

create unique index x_idx1 on x (id);

-- Deliberately different default DOPs per table so that mixed-DOP
-- behaviour (multiple DFO trees) can be demonstrated later on.
alter table t1 parallel 2;

alter table t2 parallel 2;

alter table t3 parallel 15;

alter table t4 parallel 15;

alter table t5 parallel 15;

-- t_1: 2,000,000 rows (1e5 x 20 row generator), unique id, compressed.
create table t_1
compress
as
select /*+ use_nl(a b) */
rownum as id
, rpad('x', 100) as filler
from
(select /*+ cardinality(1e5) */ * from dual
connect by
level <= 1e5) a, (select /*+ cardinality(20) */ * from dual connect by level <= 20) b
;

exec dbms_stats.gather_table_stats(null, 't_1')

-- t_2: 2,000,000 rows with a deliberately skewed foreign key:
-- only the first 5e5 rows get (near-)distinct FK_ID_SKEW values,
-- all remaining 1.5e6 rows share the single value 1.
-- method_opt 'size 1' suppresses histograms so the optimizer does
-- not see the skew.
create table t_2
compress
as
select
rownum as id
, case when rownum <= 5e5 then mod(rownum, 2e6) + 1 else 1 end as fk_id_skew
, rpad('x', 100) as filler
from
(select /*+ cardinality(1e5) */ * from dual
connect by
level <= 1e5) a, (select /*+ cardinality(20) */ * from dual connect by level <= 20) b
;

exec dbms_stats.gather_table_stats(null, 't_2', method_opt=>'for all columns size 1', no_invalidate=>false)

alter table t_1 parallel 8 cache;

alter table t_2 parallel 8 cache;

-- Hash-partitioned variants of the same two tables. t_2_part is
-- partitioned on the skewed FK column, so its partition sizes are
-- skewed as well - the basis for the partition-wise-join skew demo.
create table t_1_part
partition by hash(id) partitions 8
compress
as
select /*+ use_nl(a b) */
rownum as id
, rpad('x', 100) as filler
from
(select /*+ cardinality(1e5) */ * from dual
connect by
level <= 1e5) a, (select /*+ cardinality(20) */ * from dual connect by level <= 20) b
;

exec dbms_stats.gather_table_stats(null, 't_1_part')

create table t_2_part
partition by hash(fk_id_skew) partitions 8
compress
as
select
rownum as id
, case when rownum <= 5e5 then mod(rownum, 2e6) + 1 else 1 end as fk_id_skew
, rpad('x', 100) as filler
from
(select /*+ cardinality(1e5) */ * from dual
connect by
level <= 1e5) a, (select /*+ cardinality(20) */ * from dual connect by level <= 20) b
;

exec dbms_stats.gather_table_stats(null, 't_2_part', method_opt=>'for all columns size 1', no_invalidate=>false)

alter table t_1_part parallel 8 cache;

alter table t_2_part parallel 8 cache;

---------------------------------------------------------------
-- Single DFO tree (with Parallel Execution Skew), many DFOs --
---------------------------------------------------------------

set echo on timing on time on verify on

define num_cpu = "14"

-- Manual workarea sizing so the hash joins get large, predictable
-- memory and do not spill to temp differently between runs.
-- NOTE(review): each ALTER SESSION below is issued twice in the
-- original script - presumably deliberate; kept as-is, confirm.
alter session set workarea_size_policy = manual;

alter session set sort_area_size = 200000000;

alter session set sort_area_size = 200000000;

alter session set hash_area_size = 200000000;

alter session set hash_area_size = 200000000;

-- Three-way hinted hash join producing a single DFO tree with many
-- DFOs. The hints fix the join order/method and force HASH
-- redistribution; the regexp_replace calls just burn CPU so the skew
-- becomes visible as runtime difference between PX servers.
-- The MAX() aggregates keep the result set tiny while still forcing
-- all rows to be produced.
select
max(t1_id)
, max(t1_filler)
, max(t2_id)
, max(t2_filler)
, max(t3_id)
, max(t3_filler)
from (
select /*+ monitor
no_merge
no_merge(v_1)
no_merge(v_5)
parallel(t1 &num_cpu)
PQ_DISTRIBUTE(T1 HASH HASH)
PQ_DISTRIBUTE(V_5 HASH HASH)
leading (v_1 v_5 t1)
use_hash(v_1 v_5 t1)
swap_join_inputs(t1)
*/
t1.id as t1_id
, regexp_replace(v_5.t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t1_filler
, v_5.*
from (
select /*+ parallel(t2 &num_cpu)
parallel(t3 &num_cpu)
leading(t3 t2)
use_hash(t3 t2)
swap_join_inputs(t2)
PQ_DISTRIBUTE(T2 HASH HASH)
*/
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id2 = t2.id2 (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
) v_1
, (
select /*+ parallel(t2 &num_cpu)
parallel(t3 &num_cpu)
leading(t3 t2)
use_hash(t3 t2)
swap_join_inputs(t2)
PQ_DISTRIBUTE(T2 HASH HASH)
*/
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id = t2.id (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
) v_5
, t1
where
v_1.t3_id = v_5.t3_id
and v_5.t2_id2 = t1.id2 (+) + 2001
and regexp_replace(v_5.t3_filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t1.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
)
;

break on dfo_number nodup on tq_id nodup on server_type skip 1 nodup on instance nodup

-- compute sum label Total of num_rows on server_type

-- Beautified V$PQ_TQSTAT report: one line per PX process and table
-- queue, with each process' share of the rows (%) and a simple ASCII
-- graph scaled to the busiest process of its set - uneven bars mean
-- skew. dfo_number stays out of the projection (it is still part of
-- the sort) but see the commented-out fragment in the original.
SELECT
    tq_id,
    CAST(server_type AS varchar2(10)) AS server_type,
    instance,
    CAST(process AS varchar2(8)) AS process,
    num_rows,
    ROUND(RATIO_TO_REPORT(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type) * 100) AS "%",
    CAST(RPAD('#', ROUND(num_rows * 10 / NULLIF(MAX(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type), 0)), '#') AS varchar2(10)) AS graph,
    ROUND(bytes / 1024 / 1024) AS mb,
    ROUND(bytes / NULLIF(num_rows, 0)) AS "bytes/row"
FROM
    v$pq_tqstat
ORDER BY
    dfo_number,
    tq_id,
    server_type DESC,
    instance,
    process
;

---------------------------------------------------------------------------------------------------
-- Same statement with Parallel TEMP TABLE TRANSFORMATION, V$PQ_TQSTAT shows useless information --
---------------------------------------------------------------------------------------------------

set echo on timing on time on verify on

define num_cpu = "14"

-- NOTE(review): each ALTER SESSION below is issued twice in the
-- original script - presumably deliberate; kept as-is, confirm.
alter session set workarea_size_policy = manual;

alter session set sort_area_size = 200000000;

alter session set sort_area_size = 200000000;

alter session set hash_area_size = 200000000;

alter session set hash_area_size = 200000000;

-- Same join as before, but the MATERIALIZE hint turns the main query
-- block into a TEMP TABLE TRANSFORMATION. The point of the demo: with
-- that transformation V$PQ_TQSTAT no longer reports usable per-process
-- information for the statement.
with result as
(
select /*+ materialize
monitor
no_merge
no_merge(v_1)
no_merge(v_5)
parallel(t1 &num_cpu)
PQ_DISTRIBUTE(T1 HASH HASH)
PQ_DISTRIBUTE(V_1 HASH HASH)
PQ_DISTRIBUTE(V_5 HASH HASH)
leading (v_1 v_5 t1)
use_hash(v_1 v_5 t1)
swap_join_inputs(t1)
*/
t1.id as t1_id
, regexp_replace(v_5.t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t1_filler
, v_5.*
from (
select /*+ parallel(t2 &num_cpu) parallel(t3 &num_cpu) leading(t3 t2) use_hash(t3 t2) swap_join_inputs(t2) PQ_DISTRIBUTE(T2 HASH HASH) */
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id2 = t2.id2 (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
)
v_1
, (
select /*+ parallel(t2 &num_cpu) parallel(t3 &num_cpu) leading(t3 t2) use_hash(t3 t2) swap_join_inputs(t2) PQ_DISTRIBUTE(T2 HASH HASH) */
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id = t2.id (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
) v_5
, t1
where
v_1.t3_id = v_5.t3_id
and v_5.t2_id2 = t1.id2 (+) + 2001
and regexp_replace(v_5.t3_filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t1.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
)
select max(t1_id), max(t1_filler), max(t2_id), max(t2_filler), max(t3_id), max(t3_filler) from
result;

break on dfo_number nodup on tq_id nodup on server_type skip 1 nodup on instance nodup

-- compute sum label Total of num_rows on server_type

-- Beautified V$PQ_TQSTAT report (same shape as above): per-process row
-- counts, percentage share and an ASCII skew graph, dfo_number kept in
-- the sort order but not projected.
SELECT
    tq_id,
    CAST(server_type AS varchar2(10)) AS server_type,
    instance,
    CAST(process AS varchar2(8)) AS process,
    num_rows,
    ROUND(RATIO_TO_REPORT(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type) * 100) AS "%",
    CAST(RPAD('#', ROUND(num_rows * 10 / NULLIF(MAX(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type), 0)), '#') AS varchar2(10)) AS graph,
    ROUND(bytes / 1024 / 1024) AS mb,
    ROUND(bytes / NULLIF(num_rows, 0)) AS "bytes/row"
FROM
    v$pq_tqstat
ORDER BY
    dfo_number,
    tq_id,
    server_type DESC,
    instance,
    process
;

--------------------------------------------------------------------------------------------------
-- This construct results in misleading information from V$PQ_TQSTAT (actually a complete mess) --
--------------------------------------------------------------------------------------------------

set echo on timing on time on

alter session enable parallel dml;

truncate table x;

-- Parallel INSERT APPEND joining two views of t2 that run at different
-- DOPs (4 vs. 8) - the construct that makes V$PQ_TQSTAT output
-- misleading. The rownum in each inline view forces serialization
-- points; the case-sensitive vs. case-insensitive regexp comparison is
-- a CPU burner that is always true for this data.
-- NOTE(review): the leading(v1 v2) hint references the inner aliases
-- v1/v2 rather than the outer v_1/v_2 - presumably deliberate for the
-- demo; confirm against the article.
insert /*+ append parallel(x 4) */ into x
select /*+ leading(v1 v2) optimizer_features_enable('11.2.0.1') */
v_1.id
, v_1.id2
, v_1.filler
from (
select
id
, id2
, filler
from (
select /*+ parallel(t2 4) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v1
) v_1
, (
select
id
, id2
, filler
from (
select /*+ parallel(t2 8) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v2
) v_2
where
v_1.id = v_2.id
and v_1.filler = v_2.filler
;

-- Parallel DML requires a COMMIT before querying V$PQ_TQSTAT
commit;

break on dfo_number nodup on tq_id nodup on server_type skip 1 nodup on instance nodup

compute sum label Total of num_rows on server_type

-- Beautified V$PQ_TQSTAT report, this time including DFO_NUMBER in the
-- projection (the statement above produces multiple DFO trees) and a
-- running total of rows per server set via the COMPUTE above.
SELECT
    dfo_number,
    tq_id,
    CAST(server_type AS varchar2(10)) AS server_type,
    instance,
    CAST(process AS varchar2(8)) AS process,
    num_rows,
    ROUND(RATIO_TO_REPORT(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type) * 100) AS "%",
    CAST(RPAD('#', ROUND(num_rows * 10 / NULLIF(MAX(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type), 0)), '#') AS varchar2(10)) AS graph,
    ROUND(bytes / 1024 / 1024) AS mb,
    ROUND(bytes / NULLIF(num_rows, 0)) AS "bytes/row"
FROM
    v$pq_tqstat
ORDER BY
    dfo_number,
    tq_id,
    server_type DESC,
    instance,
    process
;

----------------------------------------------------------------------
-- Single DFO tree (with Parallel Execution Skew, almost no impact) --
----------------------------------------------------------------------

set echo on timing on time on

-- NOTE(review): each ALTER SESSION below is issued twice in the
-- original script - presumably deliberate; kept as-is, confirm.
alter session set workarea_size_policy = manual;

alter session set sort_area_size = 500000000;

alter session set sort_area_size = 500000000;

alter session set hash_area_size = 500000000;

alter session set hash_area_size = 500000000;

-- Join on the skewed FK column (t_2.fk_id_skew): the HASH distribution
-- sends all rows with the popular value to one PX server, but here the
-- overall runtime impact is small - the skewed work is a minor part of
-- the total. The duplicated regexp_replace predicates just add CPU.
select /*+ leading(v1)
use_hash(t_1)
no_swap_join_inputs(t_1)
pq_distribute(t_1 hash hash)
*/
max(t_1.filler)
, max(v1.t_1_filler)
, max(v1.t_2_filler)
from
t_1
, (
select /*+ no_merge
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
pq_distribute(t_2 hash hash) */
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1
, t_2
where
t_2.fk_id_skew = t_1.id
) v1
where
v1.t_2_id = t_1.id
and regexp_replace(v1.t_2_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and regexp_replace(v1.t_2_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
;

break on dfo_number nodup on tq_id nodup on server_type skip 1 nodup on instance nodup

-- compute sum label Total of num_rows on server_type

-- Beautified V$PQ_TQSTAT report (same shape as the earlier ones):
-- per-process row counts, percentage share and an ASCII skew graph.
SELECT
    tq_id,
    CAST(server_type AS varchar2(10)) AS server_type,
    instance,
    CAST(process AS varchar2(8)) AS process,
    num_rows,
    ROUND(RATIO_TO_REPORT(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type) * 100) AS "%",
    CAST(RPAD('#', ROUND(num_rows * 10 / NULLIF(MAX(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type), 0)), '#') AS varchar2(10)) AS graph,
    ROUND(bytes / 1024 / 1024) AS mb,
    ROUND(bytes / NULLIF(num_rows, 0)) AS "bytes/row"
FROM
    v$pq_tqstat
ORDER BY
    dfo_number,
    tq_id,
    server_type DESC,
    instance,
    process
;

--------------------------------------------------------------------------------------------------------------------------------
-- Full Partition Wise Join with partition skew - V$PQ_TQSTAT is of no help, since no redistribution takes place (single DFO) --
--------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

-- NOTE(review): each ALTER SESSION below is issued twice in the
-- original script - presumably deliberate; kept as-is, confirm.
alter session set workarea_size_policy = manual;

alter session set sort_area_size = 500000000;

alter session set sort_area_size = 500000000;

alter session set hash_area_size = 500000000;

alter session set hash_area_size = 500000000;

-- pq_distribute(... none none) forces a full partition-wise join of
-- the equi-partitioned tables: each PX server joins one partition pair
-- with no redistribution, so skewed partition sizes (t_2_part is hash
-- partitioned on the skewed FK) translate directly into skewed work -
-- and V$PQ_TQSTAT has no table queues to show for it.
select count(t_2_filler) from (
select /*+ monitor
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
pq_distribute(t_2 none none)
*/
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1_part t_1
, t_2_part t_2
where
t_2.fk_id_skew = t_1.id
and regexp_replace(t_2.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
);

break on dfo_number nodup on tq_id nodup on server_type skip 1 nodup on instance nodup

-- compute sum label Total of num_rows on server_type

-- Beautified V$PQ_TQSTAT report - for the partition-wise join above it
-- is expected to show (almost) nothing useful, since no redistribution
-- takes place.
SELECT
    tq_id,
    CAST(server_type AS varchar2(10)) AS server_type,
    instance,
    CAST(process AS varchar2(8)) AS process,
    num_rows,
    ROUND(RATIO_TO_REPORT(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type) * 100) AS "%",
    CAST(RPAD('#', ROUND(num_rows * 10 / NULLIF(MAX(num_rows) OVER (PARTITION BY dfo_number, tq_id, server_type), 0)), '#') AS varchar2(10)) AS graph,
    ROUND(bytes / 1024 / 1024) AS mb,
    ROUND(bytes / NULLIF(num_rows, 0)) AS "bytes/row"
FROM
    v$pq_tqstat
ORDER BY
    dfo_number,
    tq_id,
    server_type DESC,
    instance,
    process
;

Analysing Parallel Execution Skew - Data Flow Operations (DFOs) And DFO Trees

Sat, 2014-04-05 03:24
This is the second part of the video tutorial "Analysing Parallel Execution Skew". In this part I introduce the concept of "Data Flow Operations (DFOs)" and "DFO Trees", which is what a Parallel Execution plan is made of. DFOs / DFO Trees are specific to Parallel Execution and don't have any counterpart in a serial execution plan.

Understanding the implications of DFOs / DFO Trees is important as prerequisite for understanding some of the effects shown in the later parts of the video tutorial, hence I covered this as a separate topic.

Note that this tutorial also demonstrates some new 12c features regarding Parallel Execution, in particular how Oracle 12c now lifts many of the previous limitations that lead to the generation of multiple DFO Trees.

Here is the video:



If you want to reproduce and play around with the DFO Tree variations shown in the tutorial here is the script for creating the tables and running the queries / DML commands used in the tutorial:

-- Table creation
set echo on timing on time on

-- Clean-up so the script can be re-run from scratch: drop each demo
-- table and purge it from the recycle bin. "ORA-00942" errors on a
-- first run are expected and harmless.
drop table t1;

purge table t1;

drop table t2;

purge table t2;

drop table t3;

purge table t3;

drop table t4;

purge table t4;

drop table t5;

purge table t5;

drop table x;

purge table x;

-- t1: small table - 2,000 rows (1,000 x 2 row generator), even ids.
-- The cardinality hints only size the CONNECT BY row generators.
create table t1
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't1')

alter table t1 cache;

-- t2: large table - 2,000,000 rows, odd ids, id2 cycling 1..2000.
create table t2
compress
as
select
(rownum * 2) + 1 as id
, mod(rownum, 2000) + 1 as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000000) */ * from dual
connect by
level <= 1000000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't2')

alter table t2 cache;

-- t3: same layout and content as t1.
create table t3
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't3')

alter table t3 cache;

-- t4: same layout and content as t2.
create table t4
compress
as
select
(rownum * 2) + 1 as id
, mod(rownum, 2000) + 1 as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000000) */ * from dual
connect by
level <= 1000000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't4')

alter table t4 cache;

-- t5: third copy of the small-table layout.
create table t5
as
select /*+ use_nl(a b) */
(rownum * 2) as id
, rownum as id2
, rpad('x', 100) as filler
from
(select /*+ cardinality(1000) */ * from dual
connect by
level <= 1000) a, (select /*+ cardinality(2) */ * from dual connect by level <= 2) b
;

exec dbms_stats.gather_table_stats(null, 't5')

alter table t5 cache;

-- x: empty copy of t2's structure, target for the parallel DML demos;
-- the unique index forces index maintenance during the inserts.
create table x
compress
as
select * from t2
where 1 = 2;

create unique index x_idx1 on x (id);

-- Deliberately different default DOPs per table so mixed-DOP behaviour
-- (multiple DFO trees) can be demonstrated.
alter table t1 parallel 2;

alter table t2 parallel 2;

alter table t3 parallel 15;

alter table t4 parallel 15;

alter table t5 parallel 15;

---------------------------------------------------------------
-- Single DFO tree (with Parallel Execution Skew), many DFOs --
---------------------------------------------------------------

set echo on timing on time on verify on

define num_cpu = "15"

-- Same hinted three-way hash join as in the skew demo above, here run
-- at DOP 15 and without manual workarea settings: one DFO tree with
-- many DFOs. The regexp_replace calls are CPU burners that make the
-- skew visible; the MAX() aggregates keep the result tiny.
select
max(t1_id)
, max(t1_filler)
, max(t2_id)
, max(t2_filler)
, max(t3_id)
, max(t3_filler)
from (
select /*+ monitor
no_merge
no_merge(v_1)
no_merge(v_5)
parallel(t1 &num_cpu)
PQ_DISTRIBUTE(T1 HASH HASH)
PQ_DISTRIBUTE(V_5 HASH HASH)
leading (v_1 v_5 t1)
use_hash(v_1 v_5 t1)
swap_join_inputs(t1)
*/
t1.id as t1_id
, regexp_replace(v_5.t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t1_filler
, v_5.*
from (
select /*+ parallel(t2 &num_cpu)
parallel(t3 &num_cpu)
leading(t3 t2)
use_hash(t3 t2)
swap_join_inputs(t2)
PQ_DISTRIBUTE(T2 HASH HASH)
*/
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id2 = t2.id2 (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
) v_1
, (
select /*+ parallel(t2 &num_cpu)
parallel(t3 &num_cpu)
leading(t3 t2)
use_hash(t3 t2)
swap_join_inputs(t2)
PQ_DISTRIBUTE(T2 HASH HASH)
*/
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id = t2.id (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
) v_5
, t1
where
v_1.t3_id = v_5.t3_id
and v_5.t2_id2 = t1.id2 (+) + 2001
and regexp_replace(v_5.t3_filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t1.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
)
;

---------------------------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees no parent / child (with different DOPs), separate slave sets, one active after the other (12.1: Still multiple DFO trees) --
---------------------------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on verify on

-- Two MATERIALIZEd CTEs join different table pairs (a: t1/t2/t1 at DOP
-- 2, b: t3/t4/t5 at DOP 15, via the tables' default DOPs). Each CTE is
-- written to a temp segment first, so the DFO trees run one after the
-- other with separate slave sets.
with a as (
select /*+ materialize monitor no_merge */
t1.id as t1_id
, regexp_replace(t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t1_filler
, v1.*
from (
select /*+ no_merge pq_distribute(t3 hash hash) */
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id2 = t2.id
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 2) = 0
) v1
, t1
where
v1.t2_id2 = t1.id2
),
b as (
select /*+ materialize monitor no_merge */
t1.id as t1_id
, regexp_replace(t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t1_filler
, v1.*
from (
select /*+ no_merge pq_distribute(t3 hash hash) */
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t3 t2
, t4 t3
where
t3.id2 = t2.id
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 2) = 0
) v1
, t5 t1
where
v1.t2_id2 = t1.id2
)
select max(t1_id), max(t1_filler), max(t2_id), max(t2_filler), max(t3_id), max(t3_filler) from (
select /*+ no_merge */
a.t1_id
, a.t1_filler
, a.t2_id
, a.t2_filler
, a.t3_id
, regexp_replace(a.t3_filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as t3_filler
from
a
, b
where
a.t3_id = b.t3_id
and regexp_replace(a.t3_filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(b.t3_filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(a.t1_id, 4) = 0
and mod(b.t1_id, 4) = 0
)
;

-------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees parent / child (with different DOPs), separate slave sets, concurrently active (12.1: Single DFO tree) --
-------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

alter session enable parallel dml;

truncate table x;

-- Parallel INSERT joining two rownum-generating views of t2 at
-- different DOPs (4 and 8): pre-12.1 this produces parent/child DFO
-- trees with separate slave sets that are active at the same time.
-- NOTE(review): the leading(v1 v2) hint references the inner aliases
-- v1/v2 rather than the outer v_1/v_2 - presumably deliberate; confirm.
insert /*+ append parallel(x 4) */ into x
select /*+ leading(v1 v2) */
v_1.id
, v_1.id2
, v_1.filler
from (
select
id
, id2
, filler
from (
select /*+ parallel(t2 4) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v1
) v_1
, (
select
id
, id2
, filler
from (
select /*+ parallel(t2 8) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v2
) v_2
where
v_1.id = v_2.id
and v_1.filler = v_2.filler
;

commit;

--------------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees parent / child (with different DOPs), separate slave sets, *not* concurrently active (12.1: Single DFO tree) --
--------------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

alter session enable parallel dml;

truncate table x;

-- Variant using LAG() ... OVER (ORDER BY ...): the analytic ORDER BY
-- forces each view's result to be fully sorted before rows can be
-- passed on, so the differently-DOPed DFO trees are NOT active at the
-- same time.
insert /*+ append parallel(x 4) */ into x
select
v1.*
from (
select /*+ parallel(t2 4) */
lag(t2.id) over (order by regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')) as id
, t2.id2
, regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as filler
from
t2
where
mod(t2.id2, 3) = 0
) v1
, (
select /*+ parallel(t2 8) */
lag(id) over (order by regexp_replace(filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')) as id
, id2
, regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') as filler
from
t2
where
mod(t2.id2, 3) = 0
) v2
where
v1.id = v2.id
and v1.filler = v2.filler
;

commit;

----------------------------------------------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees, no parent/child, multiple DFO tree starts, no separate slave sets, concurrently active (12.1: Single DFO tree, new parallel FILTER/SUBQUERY) --
----------------------------------------------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

-- Correlated scalar subqueries in the SELECT list, each hinted to run
-- in parallel, on top of a parallel join of two t2 views at different
-- DOPs. Pre-12.1 each scalar subquery starts its own DFO tree; 12.1
-- can run it as a single DFO tree. ROWNUM <= 100 caps the output.
select /*+ no_merge(x) */
*
from (
select
v1.filler
, (select /*+ parallel(x 2) */ id from t2 x where x.id = v1.id) as id
, (select /*+ parallel(x 2) */ id2 from t2 x where x.id = v1.id) as id2
from (
select /*+ parallel(t2 4) */
t2.id
, t2.id2
, t2.filler
from
t2
) v1
, (
select /*+ parallel(t2 8) */
t2.id
, t2.id2
, t2.filler
from
t2
) v2
where
v1.id = v2.id
and v1.filler = v2.filler
) x
where
rownum <= 100
;

----------------------------------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees, no parent/child, multiple DFO tree starts, separate slave sets, concurrently active (12.1: Single DFO tree, new parallel FILTER) --
----------------------------------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

-- Hinted parallel hash join plus a NO_UNNEST-ed NOT EXISTS subquery
-- (FILTER operator). Pre-12.1 the filter subquery runs as its own DFO
-- tree with a separate slave set; 12.1 introduces a parallel FILTER
-- (the commented-out PQ_FILTER hint would control it).
select /*+ parallel(t2 8)
parallel(t3 8)
leading(t3 t2)
use_hash(t3 t2)
swap_join_inputs(t2)
PQ_DISTRIBUTE(T2 HASH HASH)
--PQ_FILTER(@"SEL$1" NONE)
*/
t2.id as t2_id
, t2.filler as t2_filler
, t2.id2 as t2_id2
, t3.id as t3_id
, t3.filler as t3_filler
from
t1 t2
, t2 t3
where
t3.id2 = t2.id2 (+)
and regexp_replace(t3.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler (+), '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
and mod(t3.id2, 3) = 0
and not exists (select /*+ no_unnest parallel(t2 2) */ null from t2 x where x.id2 = t2.id2)
;

-------------------------------------------------------------------------------------------------------------------------------------------------------------
-- Multiple DFO trees, no parent/child, multiple DFO tree starts, separate slave sets, concurrently active (12.1: Still multiple DFO trees, serial FILTER) --
-------------------------------------------------------------------------------------------------------------------------------------------------------------

set echo on timing on time on

alter session enable parallel dml;

truncate table x;

-- Same mixed-DOP INSERT as before, extended with a NO_UNNEST-ed NOT
-- EXISTS: in this DML context even 12.1 keeps multiple DFO trees and
-- executes the FILTER serially.
insert /*+ append parallel(x 4) */ into x
select /*+ leading(v1 v2) */
v_1.id
, v_1.id2
, v_1.filler
from (
select
id
, id2
, filler
from (
select /*+ parallel(t2 4) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v1
) v_1
, (
select
id
, id2
, filler
from (
select /*+ parallel(t2 8) no_merge */
rownum as id
, t2.id2
, t2.filler
from
t2
where
mod(t2.id2, 3) = 0
and regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') = regexp_replace(t2.filler, '^([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'i')
) v2
) v_2
where
v_1.id = v_2.id
and v_1.filler = v_2.filler
and not exists (select /*+ no_unnest parallel(y 2) */ null from t2 y where y.id2 = v_1.id2)
;

commit;

New Version Of XPLAN_ASH Utility

Fri, 2014-03-28 17:26
A minor update 4.01 to the XPLAN_ASH utility is available for download.

As usual the latest version can be downloaded here.

These are the notes from the change log:

- More info for RAC Cross Instance Parallel Execution: Many sections now show a GLOBAL aggregate info in addition to instance-specific data

- The Parallel Execution Server Set detection and ASSUMED_DEGREE info now makes use of the undocumented PX_STEP_ID and PX_STEPS_ARG info (bit mask part of the PX_FLAGS column) on 11.2.0.2+

- Since version 4.0 added from 11.2.0.2 on the PX *MAX* DOP in the "SQL statement execution ASH Summary" based on the new PX_FLAGS column of ASH it makes sense to add a PX *MIN* DOP in the summary to see at one glance if different DOPs were used or not

- The "Active DOPs" column in the "Activity Timeline based on ASH" was extended/modified: The number in parentheses is no longer the simple count of samples but the Average Active Sessions (AAS) per DFO / bucket.

From 11.2.0.2 it now shows also the DOP of the DFO in brackets, so the
output could look now like this:

1[16] (14.5)

which means DFO 1 at a DOP of 16 had an AAS value of 14.5 for this time
bucket. If there are multiple DFOs active in the time bucket, they are
separated by commas:

1[16] (3.5),2[4] (1.5)

which means DFO 1 at a DOP of 16 had an AAS value of 3.5 and DFO 2 at a
DOP of 4 had an AAS value of 1.5 for this time bucket

A new version 4.1 is already underway that includes new 12c features, so stay tuned.

Parallel Execution Overhead

Tue, 2014-03-18 09:46
I've started a new mini series about "Parallel Execution Skew" at AllThingsOracle.com. In order to avoid bloating the articles too much there I'll post some accompanying notes here on my blog.

If you follow the initial post you'll see that the sample query demonstrated there scales almost perfectly with Parallel Execution - the serial execution takes 57 seconds on my test system, whereas a Parallel Execution at a DOP of 4 takes something between 14 and 15 seconds, so almost four times faster.

Here are again the corresponding two table join queries along with their execution plans that I haven't included in the "Parallel Execution Skew" article - the plans are generated using Oracle database version 12c which you can see from the new HYBRID HASH distribution method for parallel plans:

Serial query:

-- Serial baseline: hinted hash join of t_1 and t_2 on the uniformly
-- distributed FK column; NO_PARALLEL forces serial execution. The
-- regexp_replace comparison is a CPU burner so the runtime is
-- dominated by evenly distributable work.
select count(t_2_filler) from (
select /*+ monitor
no_parallel
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
*/
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1
, t_2
where
t_2.fk_id_uniform = t_1.id
and regexp_replace(t_2.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
);

Execution plan:

------------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes |TempSpc| Cost (%CPU)| Time |
------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | 213 | | 24508 (1)| 00:00:01 |
| 1 | SORT AGGREGATE | | 1 | 213 | | | |
|* 2 | HASH JOIN | | 100K| 20M| 225M| 24508 (1)| 00:00:01 |
| 3 | TABLE ACCESS FULL| T_1 | 2000K| 202M| | 796 (2)| 00:00:01 |
| 4 | TABLE ACCESS FULL| T_2 | 2000K| 204M| | 1289 (2)| 00:00:01 |
------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

2 - access("T_2"."FK_ID_UNIFORM"="T_1"."ID")
filter( REGEXP_REPLACE ("T_2"."FILLER",'^\s+([[:alnum:]]+)\s+$','
\1',1,1,'c')>= REGEXP_REPLACE ("T_1"."FILLER",'^\s+([[:alnum:]]+)\s+$','
\1',1,1,'c'))
Parallel Query:

select count(t_2_filler) from (
select /*+ monitor
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
pq_distribute(t_2 hash hash)
*/
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1
, t_2
where
t_2.fk_id_uniform = t_1.id
and regexp_replace(t_2.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
);

Execution plan:

------------------------------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes |TempSpc| Cost (%CPU)| Time | TQ |IN-OUT| PQ Distrib |
------------------------------------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | 213 | | 6186 (1)| 00:00:01 | | | |
| 1 | SORT AGGREGATE | | 1 | 213 | | | | | | |
| 2 | PX COORDINATOR | | | | | | | | | |
| 3 | PX SEND QC (RANDOM) | :TQ10002 | 1 | 213 | | | | Q1,02 | P->S | QC (RAND) |
| 4 | SORT AGGREGATE | | 1 | 213 | | | | Q1,02 | PCWP | |
|* 5 | HASH JOIN | | 100K| 20M| 56M| 6186 (1)| 00:00:01 | Q1,02 | PCWP | |
| 6 | PX RECEIVE | | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,02 | PCWP | |
| 7 | PX SEND HYBRID HASH | :TQ10000 | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | P->P | HYBRID HASH|
| 8 | STATISTICS COLLECTOR | | | | | | | Q1,00 | PCWC | |
| 9 | PX BLOCK ITERATOR | | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | PCWC | |
| 10 | TABLE ACCESS FULL | T_1 | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | PCWP | |
| 11 | PX RECEIVE | | 2000K| 204M| | 358 (2)| 00:00:01 | Q1,02 | PCWP | |
| 12 | PX SEND HYBRID HASH | :TQ10001 | 2000K| 204M| | 358 (2)| 00:00:01 | Q1,01 | P->P | HYBRID HASH|
| 13 | PX BLOCK ITERATOR | | 2000K| 204M| | 358 (2)| 00:00:01 | Q1,01 | PCWC | |
| 14 | TABLE ACCESS FULL | T_2 | 2000K| 204M| | 358 (2)| 00:00:01 | Q1,01 | PCWP | |
------------------------------------------------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

5 - access("T_2"."FK_ID_UNIFORM"="T_1"."ID")
filter( REGEXP_REPLACE ("T_2"."FILLER",'^\s+([[:alnum:]]+)\s+$',' \1',1,1,'c')>= REGEXP_REPLACE
("T_1"."FILLER",'^\s+([[:alnum:]]+)\s+$',' \1',1,1,'c'))


Note: If you're interested in learning more about reading Parallel Execution plans, Jonathan Lewis recently completed a series of posts about that topic

Looking at a corresponding Real-Time SQL Monitoring report (see the post at AllThingsOracle.com) of the Parallel Execution it can also be seen that it took approx. 58 seconds of Database Time (spread across four worker processes), so really pretty close to the duration / Database Time of the serial execution.

However in real-life cases that have to process large amounts of data and use more complex execution plans you'll usually see Parallel Execution not scaling perfectly, even if the work distribution is not the problem as discussed in the article series.

To some degree this is because Parallel Execution comes with overhead, and therefore the database actually has to work more than with a comparable serial execution.

The purpose of this post is to demonstrate this using the set-up used in the article series, by simply extending the two table join used there to a three table join.

Here is the corresponding serial query:


select count(t_2_filler) from (
select /*+ monitor
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
use_hash(a)
no_swap_join_inputs(a)
opt_estimate(join (t_1 t_2) scale_rows=20)
no_parallel
*/
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1
, t_1 a
, t_2
where
t_2.fk_id_uniform = t_1.id
and t_2.id = a.id
and regexp_replace(t_2.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
);

Execution plan:

-------------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes |TempSpc| Cost (%CPU)| Time |
-------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | 225 | | 49560 (1)| 00:00:02 |
| 1 | SORT AGGREGATE | | 1 | 225 | | | |
|* 2 | HASH JOIN | | 2006K| 430M| 441M| 49560 (1)| 00:00:02 |
|* 3 | HASH JOIN | | 2003K| 418M| 225M| 25172 (1)| 00:00:01 |
| 4 | TABLE ACCESS FULL| T_1 | 2000K| 202M| | 796 (2)| 00:00:01 |
| 5 | TABLE ACCESS FULL| T_2 | 2000K| 215M| | 1289 (2)| 00:00:01 |
| 6 | TABLE ACCESS FULL | T_1 | 2000K| 11M| | 795 (2)| 00:00:01 |
-------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

2 - access("T_2"."ID"="A"."ID")
3 - access("T_2"."FK_ID_UNIFORM"="T_1"."ID")
filter( REGEXP_REPLACE ("T_2"."FILLER",'^\s+([[:alnum:]]+)\s+$','
\1',1,1,'c')>= REGEXP_REPLACE ("T_1"."FILLER",'^\s+([[:alnum:]]+)\s+$','
\1',1,1,'c'))
And here is the parallel query:

select count(t_2_filler) from (
select /*+ monitor
leading(t_1 t_2)
use_hash(t_2)
no_swap_join_inputs(t_2)
pq_distribute(t_2 hash hash)
use_hash(a)
no_swap_join_inputs(a)
pq_distribute(a hash hash)
opt_estimate(join (t_1 t_2) scale_rows=20)
*/
t_1.id as t_1_id
, t_1.filler as t_1_filler
, t_2.id as t_2_id
, t_2.filler as t_2_filler
from t_1
, t_1 a
, t_2
where
t_2.fk_id_uniform = t_1.id
and t_2.id = a.id
and regexp_replace(t_2.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c') >= regexp_replace(t_1.filler, '^\s+([[:alnum:]]+)\s+$', lpad('\1', 10), 1, 1, 'c')
);
Execution plan:

----------------------------------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes |TempSpc| Cost (%CPU)| Time | TQ |IN-OUT| PQ Distrib |
----------------------------------------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | 225 | | 12473 (1)| 00:00:01 | | | |
| 1 | SORT AGGREGATE | | 1 | 225 | | | | | | |
| 2 | PX COORDINATOR | | | | | | | | | |
| 3 | PX SEND QC (RANDOM) | :TQ10004 | 1 | 225 | | | | Q1,04 | P->S | QC (RAND) |
| 4 | SORT AGGREGATE | | 1 | 225 | | | | Q1,04 | PCWP | |
|* 5 | HASH JOIN | | 2006K| 430M| 110M| 12473 (1)| 00:00:01 | Q1,04 | PCWP | |
| 6 | PX RECEIVE | | 2003K| 418M| | 6354 (1)| 00:00:01 | Q1,04 | PCWP | |
| 7 | PX SEND HYBRID HASH | :TQ10002 | 2003K| 418M| | 6354 (1)| 00:00:01 | Q1,02 | P->P | HYBRID HASH|
| 8 | STATISTICS COLLECTOR | | | | | | | Q1,02 | PCWC | |
|* 9 | HASH JOIN BUFFERED | | 2003K| 418M| 56M| 6354 (1)| 00:00:01 | Q1,02 | PCWP | |
| 10 | PX RECEIVE | | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,02 | PCWP | |
| 11 | PX SEND HYBRID HASH | :TQ10000 | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | P->P | HYBRID HASH|
| 12 | STATISTICS COLLECTOR | | | | | | | Q1,00 | PCWC | |
| 13 | PX BLOCK ITERATOR | | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | PCWC | |
| 14 | TABLE ACCESS FULL | T_1 | 2000K| 202M| | 221 (1)| 00:00:01 | Q1,00 | PCWP | |
| 15 | PX RECEIVE | | 2000K| 215M| | 358 (2)| 00:00:01 | Q1,02 | PCWP | |
| 16 | PX SEND HYBRID HASH | :TQ10001 | 2000K| 215M| | 358 (2)| 00:00:01 | Q1,01 | P->P | HYBRID HASH|
| 17 | PX BLOCK ITERATOR | | 2000K| 215M| | 358 (2)| 00:00:01 | Q1,01 | PCWC | |
| 18 | TABLE ACCESS FULL | T_2 | 2000K| 215M| | 358 (2)| 00:00:01 | Q1,01 | PCWP | |
| 19 | PX RECEIVE | | 2000K| 11M| | 221 (1)| 00:00:01 | Q1,04 | PCWP | |
| 20 | PX SEND HYBRID HASH | :TQ10003 | 2000K| 11M| | 221 (1)| 00:00:01 | Q1,03 | P->P | HYBRID HASH|
| 21 | PX BLOCK ITERATOR | | 2000K| 11M| | 221 (1)| 00:00:01 | Q1,03 | PCWC | |
| 22 | TABLE ACCESS FULL | T_1 | 2000K| 11M| | 221 (1)| 00:00:01 | Q1,03 | PCWP | |
----------------------------------------------------------------------------------------------------------------------------------

Predicate Information (identified by operation id):
---------------------------------------------------

5 - access("T_2"."ID"="A"."ID")
9 - access("T_2"."FK_ID_UNIFORM"="T_1"."ID")
filter( REGEXP_REPLACE ("T_2"."FILLER",'^\s+([[:alnum:]]+)\s+$',' \1',1,1,'c')>= REGEXP_REPLACE
("T_1"."FILLER",'^\s+([[:alnum:]]+)\s+$',' \1',1,1,'c'))
Note: The query uses an undocumented OPT_ESTIMATE hint to correct the bad join cardinality estimate caused by the complex REGEXP_REPLACE expression as part of the join condition - since the execution plan is pretty much dictated by the hints this is not strictly necessary in this case. For curiosity you could remove the OPT_ESTIMATE and PQ_DISTRIBUTE(A HASH HASH) hints. If you do so you'll notice that the optimizer uses then a BROADCAST distribution for the join result of T_1 and T_2 due to the incorrect cardinality estimate - so choosing the appropriate distribution method is another area where the cardinality estimates can cause a lot of trouble, in particular for under-estimates that lead to large row sources being distributed via broadcast.

On my test system the serial execution still takes approx. 59 seconds, so not much longer than the serial two table join, whereas the Parallel Execution takes between 28 and 30 seconds at a DOP of 4 - clearly not a four times improvement as for the simple two table join (14-15 seconds), although the work was distributed evenly across the worker processes.

We can see this confirmed by looking at a corresponding Real-Time SQL Monitoring report (click on the picture to enlarge):


Notice how for the majority of the time four Parallel Execution Servers were active, so work distribution is not the problem here. We can get a clue that some overhead was the problem here by looking at the "Time & Wait Statistics" section in the "Overview" at the top of the report: It shows a "Database Time" of 2 minutes (of which almost 100% are CPU time), so clearly the Parallel Execution had to work much more than the serial execution, which just took close to one minute.

What seems to happen here is that the overhead of the HASH JOIN BUFFERED operation that actually processes the probe row source of the hash join twice, once to buffer it (and by doing so buffering only rows that will survive the join according to the internal bitmap generated as part of the hash table build process (so false positives are possible), see the HASH JOIN BUFFERED post for more background information), and a second time to perform the actual join by processing the previously buffered data, causes the costly regular expression that is part of the join condition to be evaluated twice as many times as the serial execution has to do.

Now this particular case is an exaggeration of the overhead due to the CPU intensive expression - nevertheless it is a good demonstration that you shouldn't always expect Parallel Execution to scale perfectly, even with almost perfect work distribution.

Analysing Parallel Execution Skew - Video Tutorial

Tue, 2014-03-18 09:22
Along the new mini series "Parallel Execution Skew" at "AllThingsOracle.com" that provides some information what you can do if you happen to have a parallel SQL execution that is affected by work distribution problems I'm publishing a series of video tutorials that explain how you can actually detect and measure whether a SQL execution is affected by skew or not.

Originally this tutorial was planned as one part (Part 5 actually) of the XPLAN_ASH video tutorial series, however so far I've only managed to publish just the initial two parts of that series, and these are already a bit outdated as I've released new versions of the XPLAN_ASH tool with significant changes and enhancements since then.

Since this tutorial covers a lot of information and I don't want to bother you watching a single multi-hour video, I'm splitting this tutorial into four parts:
  • Brief introduction
  • Explaining Data Flow Operations (DFOs)
  • Analysing Parallel Execution Skew without Diagnostics / Tuning Pack license
  • Analysing Parallel Execution Skew with Diagnostics / Tuning Pack license
So here is the initial part, a brief introduction what the "Parallel Execution Skew" problem is about:


The links mentioned at the beginning of the video are:
Webinar recording "Analysing and Troubleshooting Parallel Execution"
OTN mini series "Understanding Parallel Execution"