Data science Software Course Training in Ameerpet Hyderabad


Saturday 23 July 2016

Pig Lab 11: Executing Scripts and UDFs.



Executing Pig scripts:
____________________________

   Three commands (operators) are used to execute scripts:


  i) pig
  ii) exec.  
  iii) run.

i) pig:

   Executes a script from the command line (the operating system shell).

  $ pig script1.pig
--> the script is executed, but its relation aliases are not available in the grunt shell, so we cannot reuse them.

ii) exec:

 --> executes a script from inside the grunt shell.
 The aliases are still not available in grunt afterwards,
   so "no reuse".

grunt> exec  script1.pig

__________________________________

iii) run:

 ---> executes a script from inside the grunt shell.
   The aliases are available in grunt afterwards, so we can reuse them.

grunt> run script1.pig
______________________________

run:
  adv --> aliases become available in grunt.
  disadv --> aliases with the same name override previous ones.

exec:
   adv --> aliases are not exposed to grunt,
    so no overriding.
  disadv --> no reusability.

pig:
  adv --->
      -- suited for production runs.
      -- can be called from other environments, such as shell scripts.

 disadv --> aliases will not be reflected into grunt (see the sketch below).
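
A minimal sketch of the exec/run difference (the contents of script1.pig are assumed here for illustration):

-- script1.pig
e = load 'piglab/samp1' using PigStorage(',') as (id:int, name:chararray);
names = foreach e generate name;

grunt> exec script1.pig
grunt> dump names;      -- fails: exec did not expose the alias 'names' to grunt
grunt> run script1.pig
grunt> dump names;      -- works: run imported 'names' into grunt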

_________________________________

Pig UDFs:
_____________

   User defined functions.

   adv:
   i) Custom functionality.
   ii) Reusability.

 UDF life cycle:

step 1) Develop the UDF class.
step 2) Export it into a jar file.
step 3) Register the jar file with Pig.
step 4) Define a function alias for the UDF class.
step 5) Call the function.
__________________________

[training@localhost ~]$ cat > samp1
101,ravi
102,mani
103,Deva
104,Devi
105,AmAr
[training@localhost ~]$ hadoop fs -copyFromLocal samp1  piglab
[training@localhost ~]$ 


grunt> s = load 'piglab/samp1' 
>>    using PigStorage(',')
>>   as (id:int, name:chararray);

Eclipse navigation:
 i) create java project.

 file --> new --> java project.
  ex: PigDemo


 ii) create a package.

  PigDemo ---> new ---> package.


    ex:    pig.test


  iii) configure the Pig jar.

 src -- build path --> configure build path --> libraries ---> add external jars.
  
  /usr/lib/pig/pig-core.jar


  iv) create a Java class.

 pig.test ---> new --> class

   FirstUpper


  v)  export into a jar.

PigDemo ---> export ---> Java ---> JAR file --->
    /home/training/Desktop/pigudfs.jar

_____________________
package pig.test;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FirstUpper extends EvalFunc<String>
{
    // Capitalizes the first letter and lowercases the rest:  raVI --> Ravi
    public String exec(Tuple v) throws IOException
    {
        if (v == null || v.size() == 0 || v.get(0) == null)
            return null;                                 // pass nulls through
        String name = (String) v.get(0);
        String fc = name.substring(0, 1).toUpperCase();  // first character
        String rc = name.substring(1).toLowerCase();     // remaining characters
        return fc + rc;
    }
}

grunt> register  Desktop/pigudfs.jar;

grunt> define cconvert pig.test.FirstUpper();

grunt> r = foreach s generate
>>       id, cconvert(name) as name;

grunt> dump r;
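
Expected output (not shown in the original transcript; derived from samp1):

(101,Ravi)
(102,Mani)
(103,Deva)
(104,Devi)
(105,Amar)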

______________________________

[training@localhost ~]$ cat > f1
100     200     120
300     450     780
120     56      90
1000    3456    789
[training@localhost ~]$ hadoop fs -copyFromLocal f1 piglab
[training@localhost ~]$ 

task:
  write a UDF to find the max value in a row.


package pig.test;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class RowMax extends EvalFunc<Integer>
{
    // Returns the largest of the three int fields in a row.
    public Integer exec(Tuple v) throws IOException
    {
        int a = (Integer) v.get(0);
        int b = (Integer) v.get(1);
        int c = (Integer) v.get(2);
        int big = a;           // start from the first value, so the result
        if (b > big) big = b;  // is correct even when all values are negative
        if (c > big) big = c;
        return big;            // autoboxed to Integer
    }
}

export into jar.


    /home/training/Desktop/pigudfs.jar


grunt> s1 = load 'piglab/f1' 
>>     as (a:int, b:int, c:int);
grunt> register Desktop/pigudfs.jar;
grunt> define rowmax pig.test.RowMax();
grunt> r1 = foreach s1 generate  *,
>>           rowmax(*) as rmax;
grunt> dump r1
(100,200,120,200)
(300,450,780,780)
(120,56,90,120)
(1000,3456,789,3456)

[training@localhost ~]$ cat f2
-10,-30,-56,-23,-21,-5
1,2,3,45,67,9
[training@localhost ~]$ hadoop fs -copyFromLocal f2 piglab
[training@localhost ~]$ 


package pig.test;

import java.io.IOException;
import java.util.List;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class DynRowMax extends EvalFunc<Integer>
{
    // Handles any number of int fields: take the first value as the
    // initial max, then compare the rest against it.
    public Integer exec(Tuple v) throws IOException
    {
        List<Object> olist = v.getAll();
        int max = (Integer) olist.get(0);
        for (Object o : olist) {
            int val = (Integer) o;
            max = Math.max(val, max);
        }
        return max;
    }
}

export into jar   /home/training/Desktop/pigudfs.jar


 grunt> register Desktop/pigudfs.jar;
grunt> define dynmax pig.test.DynRowMax();
grunt> ss = load 'piglab/f2'    
>>    using PigStorage(',')
>>   as (a:int, b:int, c:int, d:int, 
>>     e:int, f:int);
grunt> define  rmax pig.test.RowMax();
grunt> rr = foreach ss generate *,
>>      dynmax(*) as max;
grunt> dump rr
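
Expected output (not shown in the original transcript; derived from f2):

(-10,-30,-56,-23,-21,-5,-5)
(1,2,3,45,67,9,67)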

Pig Lab 10: Quarterly Sales Report and Comparing Sales with the Previous Quarter (cross).

grunt> raw = load 'piglab/sales'
>>     using PigStorage(',')
>>     as (dt:chararray, amt:int);
grunt> msales = foreach raw generate  
>>      SUBSTRING(dt,0,2)  as m, amt ; 
grunt> describe msales;
msales: {m: chararray,amt: int}
grunt> msales = foreach msales generate
>>         (int)m, amt;
grunt> describe msales;                
msales: {m: int,amt: int}
grunt> dump msales;

grunt> qsales = foreach msales generate
>>       (m<4 ? 1:
>>         (m<7 ? 2:
>>           (m<10 ? 3:4))) as q, amt ;
grunt> dump qsales


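(The transcript omits the step that builds the quarterly report rep; from describe cr below — rep: {q: int, tot: long} — it was presumably something like:

grunt> grp = group qsales by q;
grunt> rep = foreach grp generate group as q, SUM(qsales.amt) as tot;)
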
grunt> rep2 = foreach rep generate *;
grunt> cr = cross rep, rep2;
grunt> describe cr
cr: {rep::q: int,rep::tot: long,rep2::q: int,rep2::tot: long}
grunt> cr = foreach cr generate 
>>     rep::q as q1, rep2::q as q2,
>>     rep::tot as tot1, rep2::tot as tot2;

grunt> describe cr;
cr: {q1: int,q2: int,tot1: long,tot2: long}
grunt> dump cr;

grunt> fltd = filter cr by ((q1-q2)==1);
grunt> dump fltd;

grunt> res = foreach fltd generate *,
>>        ((tot1-tot2)*100)/tot2 as pgrowth;
grunt> dump res

(3,2,70000,355000,-80)
(2,1,355000,140000,153)
(4,3,290000,70000,314)
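
For example, Q2 vs Q1: ((355000-140000)*100)/140000 = 153 with integer division, i.e. roughly 153% growth; a negative value such as -80 for Q3 vs Q2 means sales fell by 80%.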

_________________________________





Pig Lab 9: Language Neutralization

 ex: Hindi to English.


[training@localhost ~]$ gedit comments
[training@localhost ~]$ gedit dictionary
[training@localhost ~]$ hadoop fs -copyFromLocal comments pdemo
[training@localhost ~]$ hadoop fs -copyFromLocal dictionary pdemo
[training@localhost ~]$ 
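
(The file contents are not shown; from the join output below, the dictionary evidently contains at least: acha,good / bacha,small / lucha,worst.)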

grunt> comms = load 'pdemo/comments'
>>     as (line:chararray);
grunt> dict = load 'pdemo/dictionary'
>>     using PigStorage(',')
>>    as (hindi:chararray, eng:chararray);
grunt> words = foreach comms
>>    generate FLATTEN(TOKENIZE(line)) as word;
grunt> 


grunt> describe words
words: {word: chararray}
grunt> describe dict
dict: {hindi: chararray,eng: chararray}
grunt> j = join words by word left outer, 
>>    dict by hindi;
grunt> jj = foreach j generate words::word as word, dict::hindi as hindi, dict::eng as eng;
grunt> dump jj
(is,,)
(is,,)
(is,,)
(xyz,,)
(acha,acha,good)
(bacha,bacha,small)
(lucha,lucha,worst)
(hadoop,,)
(oracle,,)
____________________

grunt> rset = foreach jj generate 
>>    (hindi is not null ? eng:word) as word;
grunt> dump rset

(is)
(is)
(is)
(xyz)
(good)
(small)
(worst)
(hadoop)
(oracle)

__________________________

Then we can apply sentiment analysis to this result set.

__________________________


Saturday 9 July 2016

Hive Lab 14: Joins and Cartesian Product


 Joins
______________

   Used to collect data from two or more tables.

   Joins are of two types:

 i) Inner joins
 ii) Outer joins

Inner join:
   returns the matching rows for the given join condition.

   ex:
  tab1 ---> name,city
___________
  A,hyd
  B,del
  c,hyd
  d,del
  e,pune
  f,mumbai   

tab2 ---> city, dept
_________
  hyd,production
  del,hr
  pune,accounts
  chennai,admin
_______________

 inner join --> based on city.


 A,hyd,hyd,production
 B,del,del,hr
 c,hyd,hyd,production
 d,del,del,hr
 e,pune,pune,accounts

outer join -->
   returns matching records plus non-matching records.


 Outer joins are of 3 types:
  i) left outer join.
 ii) right outer join
 iii) full outer join.

left outer join:

   matchings + non-matchings of the left side
   (complete presence of the left-side table).
   
 tabx --> id, dno
____________
 101,11
 102,14
___________

 taby ---> dno,loc
 _____________
 11,hyd
 13,del
 _____________


 tabx left outer join taby --> based on dno.


   101,11,11,hyd
   102,14,null,null

 right outer join -->
   matchings and non matchings of right side table.

   (complete presence from right side table ).

 tabx right outer join taby -->based on dno.

   101,11,11,hyd
   null,null,13,del

full outer join --->
  matchings + non-matchings of the left side
  + non-matchings of the right side table.

 tabx full outer join taby ---> based on dno.

   101,11,11,hyd
   102,14,null,null
   null,null,13,del

___________________________________


[training@localhost ~]$ cat emp
101,vino,26000,m,11
102,Sri,25000,f,11
103,mohan,13000,m,13
104,lokitha,8000,f,12
105,naga,6000,m,13
101,janaki,10000,f,12
201,aaa,30000,m,12
202,bbbb,50000,f,13
203,ccc,10000,f,13
204,ddddd,50000,m,13
304,xxx,70000,m,14
305,yyy,80000,f,15
[training@localhost ~]$ cat dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,prod,hyd
21,admin,chennai
[training@localhost ~]$ 


hive> create database joinsdb;
OK
Time taken: 1.761 seconds
hive> use joinsdb;
OK
Time taken: 0.027 seconds
hive> create table emp(id int, name string, 
    >   sal int, sex string, dno int)
    > row format delimited 
    >   fields terminated by ',';
OK
Time taken: 0.297 seconds
hive> load data local inpath 'emp' into table emp;
Copying data from file:/home/training/emp
Copying file: file:/home/training/emp
Loading data to table joinsdb.emp
OK
Time taken: 0.207 seconds
hive> select * from emp;
OK
101     vino    26000   m       11
102     Sri     25000   f       11
103     mohan   13000   m       13
104     lokitha 8000    f       12
105     naga    6000    m       13
101     janaki  10000   f       12
201     aaa     30000   m       12
202     bbbb    50000   f       13
203     ccc     10000   f       13
204     ddddd   50000   m       13
304     xxx     70000   m       14
305     yyy     80000   f       15
Time taken: 0.194 seconds
hive> 

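(The create statement for dept is not shown in the transcript; presumably something like:

hive> create table dept(dno int, dname string, loc string)
    > row format delimited fields terminated by ',';)
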
hive> load data local inpath 'dept'
    >  into table dept;
Copying data from file:/home/training/dept
Copying file: file:/home/training/dept
Loading data to table joinsdb.dept
OK
Time taken: 0.12 seconds
hive> select * from dept;
OK
11      marketing       hyd
12      hr      del
13      finance hyd
20      prod    hyd
21      admin   chennai
Time taken: 0.068 seconds
hive> 

  Inner Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc
    >  from  emp l join dept r
    >  on (l.dno=r.dno);

101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
Time taken: 14.11 seconds
hive> 


Left outer Join:


hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l left outer join dept r
    >  on (l.dno=r.dno);                 



101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
304     xxx     70000   m       14      NULL    NULL    NULL
305     yyy     80000   f       15      NULL    NULL    NULL
Time taken: 12.786 seconds
hive> 


Right outer Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l right outer join dept r
    >  on (l.dno=r.dno);  

101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
NULL    NULL    NULL    NULL    NULL    20      prod    hyd
NULL    NULL    NULL    NULL    NULL    21      admin   chennai
Time taken: 12.429 seconds
hive> 

Full outer Join:

hive> select id,name,sal,sex,l.dno,r.dno, 
    >    dname,loc                        
    >  from  emp l full outer join dept r 
    >  on (l.dno=r.dno);   

   

Denormalizing:

hive> create table info(id int, name string, 
    >   sal int, sex string , dname string, 
    >   loc string);
OK
Time taken: 0.04 seconds
hive> insert overwrite table info
    >   select id, name, sal, sex, dname, loc
    >  from emp l full outer join dept r
    >  on (l.dno=r.dno);

           

hive> select * from info;
OK
101     vino    26000   m       marketing       hyd
102     Sri     25000   f       marketing       hyd
104     lokitha 8000    f       hr      del
101     janaki  10000   f       hr      del
201     aaa     30000   m       hr      del
103     mohan   13000   m       finance hyd
105     naga    6000    m       finance hyd
202     bbbb    50000   f       finance hyd
203     ccc     10000   f       finance hyd
204     ddddd   50000   m       finance hyd
304     xxx     70000   m       NULL    NULL
305     yyy     80000   f       NULL    NULL
NULL    NULL    NULL    NULL    prod    hyd
NULL    NULL    NULL    NULL    admin   chennai
Time taken: 0.071 seconds
hive> 

_________________

Task:

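Goal (inferred from the queries below): classify each row of the full outer join of emp and dept as Working (both dnos matched), BenchTeam (an employee whose dno has no department), or BenchProj (a department with no employees), then summarize salaries per status.
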
hive> create table projects(dno1 int, dno2 int,
    >   sal int);
OK
Time taken: 0.072 seconds
hive> insert overwrite table projects
    >   select l.dno, r.dno, sal from 
    >   emp l full outer join dept r 
    >   on (l.dno=r.dno);

hive> create table proj(stat string, sal int);
hive> insert overwrite table proj
    >  select 
    > if(dno1 is not null and dno2 is not null,
    > 'Working',
    >  if(dno2 is null,'BenchTeam','BenchProj')),
    >  sal from projects;

hive> select * from proj;
OK
Working 26000
Working 25000
Working 8000
Working 10000
Working 30000
Working 13000
Working 6000
Working 50000
Working 10000
Working 50000
BenchTeam       70000
BenchTeam       80000
BenchProj       NULL
BenchProj       NULL
Time taken: 0.087 seconds
hive> 

hive> insert overwrite table proj
    >  select stat, if(sal is null,0,sal)
    >  from proj;


hive> create table summary(stat string, 
    >    totsal int, cnt int);
OK
Time taken: 0.042 seconds
hive> insert overwrite table summary
    >   select stat, sum(sal), count(*)
    >  from proj
    >    group by stat;

hive> select * from summary;
OK
BenchProj       0       2
BenchTeam       150000  2
Working 228000  10


___________________________

[training@localhost ~]$ cat trans
01/01/2010,30000
01/07/2010,40000
01/08/2010,30000
01/26/2010,40000
02/01/2010,30000
02/07/2010,40000
02/09/2010,30000
02/28/2010,40000
03/01/2010,80000
03/07/2010,40000
04/01/2010,30000
04/17/2010,70000
04/18/2010,80000
04/26/2010,90000
05/01/2010,10000
05/07/2010,20000
05/08/2010,30000
05/26/2010,60000
06/01/2010,30000
06/17/2010,50000
07/01/2010,30000
07/07/2010,40000
08/08/2010,10000
08/26/2010,20000
09/01/2010,90000
09/07/2010,40000
09/18/2010,30000
09/30/2010,40000
10/01/2010,30000
10/07/2010,40000
10/08/2010,30000
11/26/2010,40000
11/01/2010,30000
12/07/2010,40000
12/08/2010,30000
12/26/2010,40000
[training@localhost ~]$ 

hive> create table rawsales(dt string, 
    >  amt int)
    > row format delimited
    >  fields terminated by ',';
OK
Time taken: 0.057 seconds
hive> load data local inpath 'trans'
    >  into table rawsales;
Copying data from file:/home/training/trans
Copying file: file:/home/training/trans
Loading data to table joinsdb.rawsales
OK
Time taken: 0.514 seconds
hive> 


hive> create table raw2(dt array<string>, amt int);

hive> insert overwrite table raw2
    >   select split(dt,'/'), amt from rawsales;


hive> select * from raw2 limit 3;
OK
["01","01","2010"]      30000
["01","07","2010"]      40000
["01","08","2010"]      30000
Time taken: 0.061 seconds
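(The sales table is not created in the transcript; presumably:

hive> create table sales(dt string, amt int);)
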
hive> insert overwrite table sales
    >   select 
    > concat(dt[2],'-',dt[0],'-',dt[1]), amt
    >  from raw2;

hive> create table sales2(mon int, amt int);
OK
Time taken: 0.044 seconds
hive> insert overwrite table sales2
    >   select month(dt), amt from sales;


hive> create table rep1(mon int, tot int);
OK
Time taken: 0.034 seconds
hive> insert overwrite table rep1
    >   select mon, sum(amt) from sales2
    >    group by mon;



hive> create table carts(m1 int, m2 int, 
    >   tot1 int, tot2 int);             
OK
Time taken: 0.044 seconds
hive> insert overwrite table carts
    >   select l.mon , r.mon , l.tot, r.tot
    >  from rep1 l join rep1 r;


hive> insert overwrite table carts
    >  select * from carts
    >    where (m1-m2)=1;

hive> select * from carts;
OK
12      11      110000  70000
7       6       70000   80000
2       1       140000  140000
8       7       30000   70000
4       3       270000  120000
9       8       200000  30000
10      9       100000  200000
5       4       120000  270000
11      10      70000   100000
3       2       120000  140000
6       5       80000   120000


hive> alter table carts add columns(per int);
OK
Time taken: 0.067 seconds
hive> insert overwrite table carts
    >   select m1,m2,tot1,tot2,
    >     ((tot1-tot2)*100)/tot2 from carts;

hive> select * from carts;
OK
12      11      110000  70000   57
7       6       70000   80000   -12
2       1       140000  140000  0
8       7       30000   70000   -57
4       3       270000  120000  125
9       8       200000  30000   566
10      9       100000  200000  -50
5       4       120000  270000  -55
11      10      70000   100000  -30
3       2       120000  140000  -14
6       5       80000   120000  -33
Time taken: 0.061 seconds
hive> 
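
For example, month 12 vs month 11: ((110000-70000)*100)/70000 = 57 with integer division, i.e. a 57% growth over the previous month.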



Wednesday 6 July 2016

Hive Lab 13 : Eliminating Duplicates and Unions (merging)


[training@localhost ~]$ cat dupes
101,aaa,10000
101,bbb,20000
101,aaa,10000
101,aaa,10000
101,aaa,10000
102,bbb,40000
103,cccc,50000
102,bbb,40000
102,bbb,40000
[training@localhost ~]$ 

hive> create database hdp;
OK
Time taken: 1.702 seconds
hive> use hdp;
OK
Time taken: 0.018 seconds
hive> create table info(id int, name string, 
    >   sal int)
    >  row format delimited fields terminated 
    >  by ',';
OK
Time taken: 0.439 seconds
hive> 

hive> load data local inpath 'dupes'
    >  into table info;
Copying data from file:/home/training/dupes
Copying file: file:/home/training/dupes
Loading data to table hdp.info
OK
Time taken: 0.209 seconds
hive> select * from info;
OK
101     aaa     10000
101     bbb     20000
101     aaa     10000
101     aaa     10000
101     aaa     10000
102     bbb     40000
103     cccc    50000
102     bbb     40000
102     bbb     40000
Time taken: 0.201 seconds
hive> 
hive> select distinct(id),name,sal
    >   from info;


101     aaa     10000
101     bbb     20000
102     bbb     40000
103     cccc    50000

way2)
hive> select id,name,sal           
    >   from info           
    >  group by id,name,sal;

101     aaa     10000
101     bbb     20000
102     bbb     40000
103     cccc    50000

Way3)
 if the table has many columns:

 hive> create table dummy(line string);
 hive> load data inpath
  '/user/hive/warehouse/hdp.db/info/dupes'
  into table dummy;
 hive> insert overwrite table dummy
    select line from dummy group by line;

 hive> load data inpath
  '/user/hive/warehouse/hdp.db/dummy/000000_0'
  into table info;

  hive> drop table dummy;
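
 Note: load data inpath (without local) moves the file within HDFS, so the dupes file leaves the info directory while it is deduplicated through dummy, and the deduplicated result file 000000_0 is then moved back into info.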

________________________________________

 hive> select * from emp;
OK
101     vino    26000   m       11
102     Sri     25000   f       11
103     mohan   13000   m       13
104     lokitha 8000    f       12
105     naga    6000    m       13
101     janaki  10000   f       12
201     aaa     30000   m       12
202     bbbb    50000   f       13
203     ccc     10000   f       13
204     ddddd   50000   m       13
304     xxx     70000   m       14
305     yyy     80000   f       15
Time taken: 0.053 seconds
hive> 


hive> create table todayemp(id int, name string, 
    >  sal int, sex string, dno int) 
    > row format delimited
    >   fields terminated by ',';
OK
Time taken: 0.031 seconds
hive> load data local inpath 'emp2'
    >  into table todayemp;
Copying data from file:/home/training/emp2
Copying file: file:/home/training/emp2
Loading data to table hdp.todayemp
OK
Time taken: 0.095 seconds
hive> select * from todayemp;
OK
401     aaa     50000   m       11
402     bbbbbb  60000   f       12
403     cc      90000   m       11
Time taken: 0.051 seconds
hive> 


Appending one table's data to another table:


way1)
  insert overwrite table emp
    select * from (
      select * from emp
            union all
      select * from todayemp) e;


 -- if the first table has 1 lakh rows and the second has 10, then just to append 10 rows Hive scans 1 lakh + 10 rows.  [ bad ]

 way2)

  if the delimiters of both tables are the same:

  hadoop fs -cp /user/hive/warehouse/hdp.db/todayemp/emp2   /user/hive/warehouse/hdp.db/emp
   
_______________________________________




 union all:

    if the column order of the two tables differs:


  tab1 --> id name  sal   sex dno
 tab2  ---> id name dno sex sal

  select * from (
      select id, name,sal,sex,dno from tab1
           union all
   select id, name,sal,sex,dno from tab2 ) t;


________________

 if the tables have different columns:

 tab1 --> id, name,  sal, sex , dno , desig

 tab2 --> id, name, income, gender, city



  select * from (
    select id, name, sal, sex, dno, desig, 'neevuru' as city from  tab1
           union all
 select id,name,income as sal, gender as sex,
   0 as dno , 'nopani' as desig, city 
    from tab2 ) t;

__________________________________________


Tuesday 5 July 2016

Hive Lab 12 : Loading Array, Struct, Map collection Items

[training@localhost ~]$ cat profile1
101,Venu,btech#mtech,25,hyd
102,Veni,bsc#msc#mtech,26,pune
[training@localhost ~]$ 



hive> create database pract;
OK
Time taken: 1.798 seconds
hive> use pract;
OK
Time taken: 0.027 seconds
hive> create table profile1(
    >   id int, name string, 
    >     qual  array<string>, age int,
    >     city string) 
    > row format delimited        
    >     fields terminated by ','
    >     collection items terminated by '#';

hive> load data local inpath 'profile1'
    >   into table profile1;
Copying data from file:/home/training/profile1
Copying file: file:/home/training/profile1
Loading data to table pract.profile1
OK
Time taken: 0.208 seconds
hive> select * from profile1;
OK
101     Venu    ["btech","mtech"]       25      hyd
102     Veni    ["bsc","msc","mtech"]   26      pune
Time taken: 0.199 seconds
hive> 

__________________
[training@localhost ~]$ cat profile2
101,Venu,Vani#25#btech,hyd
102,Varun,Varuna#24#mba,pune
[training@localhost ~]$ 

hive> create table profile2(
    >  id int, name string, 
    >   wife struct<name:string,age:int,qual:string>, city string)
    >   row format delimited 
    >     fields terminated by ','
    >     collection items terminated by '#';
OK
Time taken: 0.044 seconds
hive> load data local inpath 'profile2'
    >   into table profile2;
Copying data from file:/home/training/profile2
Copying file: file:/home/training/profile2
Loading data to table pract.profile2
OK
Time taken: 0.1 seconds
hive> 
hive> select * from profile2;
OK
101     Venu    {"name":"Vani","age":25,"qual":"btech"}   hyd
102     Varun   {"name":"Varuna","age":24,"qual":"mba"}   pune
Time taken: 0.066 seconds
hive> select name, wife.name from profile2;

Venu    Vani
Varun   Varuna

hive> create table info(id int, name string, 
    >   wname string, wage int, wqual string, 
    >   city string)
    > ;
OK
Time taken: 0.039 seconds
hive> insert overwrite table info
    >   select id, name, wife.name, wife.age,
    >   wife.qual , city  from profile2;

hive> select * from info;
OK
101     Venu    Vani    25      btech   hyd
102     Varun   Varuna  24      mba     pune
Time taken: 0.066 seconds
hive> 

____________________________________
[training@localhost ~]$ cat profile3
101,p1#1000$p3#2000$p7#4000,hyd
102,p1#1200$p2#5000,del
[training@localhost ~]$ 

hive> create table  trans(cid int, 
    >   plist map<string,int>,
    >  city string)
    > row format delimited 
    >    fields terminated by ','
    >    collection items terminated by '$'
    >   map keys terminated by '#';
OK
Time taken: 0.048 seconds
hive> load data local inpath 'profile3'
    >   into table trans;
Copying data from file:/home/training/profile3
Copying file: file:/home/training/profile3
Loading data to table pract.trans
OK
Time taken: 0.103 seconds

hive> select cid, plist['p1'] , city from trans;

101     1000    hyd
102     1200    del


hive> create table sales(cid int, 
    >    prid  array<string>,
    >   pr array<int>);
OK
Time taken: 0.05 seconds
hive> insert overwrite table sales
    >   select cid, map_keys(plist),
    >    map_values(plist)  from trans;


hive> select * from sales;
OK
101     ["p1","p3","p7"]        [1000,2000,4000]
102     ["p1","p2"]     [1200,5000]
Time taken: 0.056 seconds
hive> 
___________________________________









Hive Lab 11: JSON Array Processing

[training@localhost ~]$ cat json3
{"name":"Ravi","qual":["btech","mtech"]}
{"name":"avani","qual":["btech","mtech","mba"]}
{"name":"avinash","qual":["btech","mba"]}
[training@localhost ~]$ 
hive> create database jsons;
OK
Time taken: 1.747 seconds
hive> use jsons;
OK
Time taken: 0.032 seconds
hive> create table raw(line string);
OK
Time taken: 0.382 seconds
hive>  

hive>  load data local inpath 'json3'
    >  into table raw;

hive> select * from raw;  
OK
{"name":"Ravi","qual":["btech","mtech"]}
{"name":"avani","qual":["btech","mtech","mba"]}
{"name":"avinash","qual":["btech","mba"]}
Time taken: 0.233 seconds
hive> 

hive> create table raw2(name string, 
    >   qual  string);
OK
Time taken: 0.047 seconds
hive> insert overwrite table raw2
    >   select x.* from raw
    > lateral view
    >  json_tuple(line,'name','qual') x
    > as n,q;

hive> select * from raw2;
OK
Ravi    ["btech","mtech"]
avani   ["btech","mtech","mba"]
avinash ["btech","mba"]
Time taken: 0.064 seconds
hive> create table raw3(name string, 
    >   qual  array<string>);
OK
Time taken: 0.037 seconds
hive> insert overwrite table raw3
    >   select name, split(qual,',') from raw2;

hive> select * from raw3;
OK
Ravi    ["[\"btech\"","\"mtech\"]"]
avani   ["[\"btech\"","\"mtech\"","\"mba\"]"]
avinash ["[\"btech\"","\"mba\"]"]
Time taken: 0.063 seconds
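(qual in raw2 still holds the raw JSON text, e.g. ["btech","mtech"], so split(qual,',') keeps the bracket and quote characters in each element; the next steps explode the array and use split(qual,'"')[1] to strip them.)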
hive> create table raw4(name string, qual string);
OK

hive> insert overwrite table raw4
    >   select name, myq from raw3
    > lateral view explode(qual) q as myq;

hive> create table info(name string, 
    >    qual string);
OK
Time taken: 0.039 seconds
hive> insert overwrite table info
    >   select name, split(qual,'"')[1]
    >  from raw4;

hive> select * from info;
OK
Ravi    btech
Ravi    mtech
avani   btech
avani   mtech
avani   mba
avinash btech
avinash mba
Time taken: 0.065 seconds
hive>