Data science Software Course Training in Ameerpet Hyderabad


Saturday 23 July 2016

Pig Lab 11: Executing Scripts and UDFs.



Executing Pig scripts:
____________________________

   Three commands (operators) are used to execute scripts:


  i) pig
  ii) exec.  
  iii) run.

i) pig:

   Executes a script from the command line (the operating system shell).

  $ pig script1.pig
--> the script is executed, but its relation aliases are not available in the grunt shell, so we cannot reuse them.

ii) exec:

 --> executes a script from inside the grunt shell.
 The aliases are still not available in grunt afterwards,
   so "no reuse".

grunt> exec  script1.pig

__________________________________

iii) run:

 ---> executes a script from inside the grunt shell.
   The aliases are available in grunt afterwards, so we can reuse them.

grunt> run script1.pig
______________________________

run:
  adv --> aliases become available in grunt.
  disadv --> aliases with the same name override previous ones.

exec:
   adv --> aliases are not exposed to grunt,
    so no overriding.
  disadv --> no reusability.

pig:
  adv --->
      -- suited for production runs.
      -- can be called from other environments, such as shell scripts.

 disadv --> aliases will not be reflected into grunt (see the sketch below).
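
A minimal sketch of the exec/run difference (the contents of script1.pig are assumed here for illustration):

-- script1.pig
e = load 'piglab/samp1' using PigStorage(',') as (id:int, name:chararray);
names = foreach e generate name;

grunt> exec script1.pig
grunt> dump names;      -- fails: exec did not expose the alias 'names' to grunt
grunt> run script1.pig
grunt> dump names;      -- works: run imported 'names' into grunt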

_________________________________

Pig UDFs:
_____________

   User defined functions.

   adv:
   i) Custom functionality.
   ii) Reusability.

 UDF life cycle:

step 1) Develop the UDF class.
step 2) Export it into a jar file.
step 3) Register the jar file with Pig.
step 4) Define a function alias for the UDF class.
step 5) Call the function.
__________________________

[training@localhost ~]$ cat > samp1
101,ravi
102,mani
103,Deva
104,Devi
105,AmAr
[training@localhost ~]$ hadoop fs -copyFromLocal samp1  piglab
[training@localhost ~]$ 


grunt> s = load 'piglab/samp1' 
>>    using PigStorage(',')
>>   as (id:int, name:chararray);

Eclipse navigation:
 i) create java project.

 file --> new --> java project.
  ex: PigDemo


 ii) create a package.

  PigDemo ---> new ---> package.


    ex:    pig.test


  iii) configure the Pig jar.

 src -- build path --> configure build path --> libraries ---> add external jars.
  
  /usr/lib/pig/pig-core.jar


  iv) create a Java class.

 pig.test ---> new --> class

   FirstUpper


  v)  export into a jar.

PigDemo ---> export ---> Java ---> JAR file --->
    /home/training/Desktop/pigudfs.jar

_____________________
package pig.test;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FirstUpper extends EvalFunc<String>
{
    // Capitalizes the first letter and lowercases the rest:  raVI --> Ravi
    public String exec(Tuple v) throws IOException
    {
        if (v == null || v.size() == 0 || v.get(0) == null)
            return null;                                 // pass nulls through
        String name = (String) v.get(0);
        String fc = name.substring(0, 1).toUpperCase();  // first character
        String rc = name.substring(1).toLowerCase();     // remaining characters
        return fc + rc;
    }
}

grunt> register  Desktop/pigudfs.jar;

grunt> define cconvert pig.test.FirstUpper();

grunt> r = foreach s generate
>>       id, cconvert(name) as name;

grunt> dump r;
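
Expected output (not shown in the original transcript; derived from samp1):

(101,Ravi)
(102,Mani)
(103,Deva)
(104,Devi)
(105,Amar)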

______________________________

[training@localhost ~]$ cat > f1
100     200     120
300     450     780
120     56      90
1000    3456    789
[training@localhost ~]$ hadoop fs -copyFromLocal f1 piglab
[training@localhost ~]$ 

task:
  write a UDF to find the max value in a row.


package pig.test;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class RowMax extends EvalFunc<Integer>
{
    // Returns the largest of the three int fields in a row.
    public Integer exec(Tuple v) throws IOException
    {
        int a = (Integer) v.get(0);
        int b = (Integer) v.get(1);
        int c = (Integer) v.get(2);
        int big = a;           // start from the first value, so the result
        if (b > big) big = b;  // is correct even when all values are negative
        if (c > big) big = c;
        return big;            // autoboxed to Integer
    }
}

export into jar.


    /home/training/Desktop/pigudfs.jar


grunt> s1 = load 'piglab/f1' 
>>     as (a:int, b:int, c:int);
grunt> register Desktop/pigudfs.jar;
grunt> define rowmax pig.test.RowMax();
grunt> r1 = foreach s1 generate  *,
>>           rowmax(*) as rmax;
grunt> dump r1
(100,200,120,200)
(300,450,780,780)
(120,56,90,120)
(1000,3456,789,3456)

[training@localhost ~]$ cat f2
-10,-30,-56,-23,-21,-5
1,2,3,45,67,9
[training@localhost ~]$ hadoop fs -copyFromLocal f2 piglab
[training@localhost ~]$ 


package pig.test;

import java.io.IOException;
import java.util.List;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class DynRowMax extends EvalFunc<Integer>
{
    // Handles any number of int fields: take the first value as the
    // initial max, then compare the rest against it.
    public Integer exec(Tuple v) throws IOException
    {
        List<Object> olist = v.getAll();
        int max = (Integer) olist.get(0);
        for (Object o : olist) {
            int val = (Integer) o;
            max = Math.max(val, max);
        }
        return max;
    }
}

export into jar   /home/training/Desktop/pigudfs.jar


 grunt> register Desktop/pigudfs.jar;
grunt> define dynmax pig.test.DynRowMax();
grunt> ss = load 'piglab/f2'    
>>    using PigStorage(',')
>>   as (a:int, b:int, c:int, d:int, 
>>     e:int, f:int);
grunt> define  rmax pig.test.RowMax();
grunt> rr = foreach ss generate *,
>>      dynmax(*) as max;
grunt> dump rr
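
Expected output (not shown in the original transcript; derived from f2):

(-10,-30,-56,-23,-21,-5,-5)
(1,2,3,45,67,9,67)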

Pig Lab 10: Quarterly Sales Report and Comparing Sales with the Previous Quarter (cross).

grunt> raw = load 'piglab/sales'
>>     using PigStorage(',')
>>     as (dt:chararray, amt:int);
grunt> msales = foreach raw generate  
>>      SUBSTRING(dt,0,2)  as m, amt ; 
grunt> describe msales;
msales: {m: chararray,amt: int}
grunt> msales = foreach msales generate
>>         (int)m, amt;
grunt> describe msales;                
msales: {m: int,amt: int}
grunt> dump msales;

grunt> qsales = foreach msales generate
>>       (m<4 ? 1:
>>         (m<7 ? 2:
>>           (m<10 ? 3:4))) as q, amt ;
grunt> dump qsales


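(The transcript omits the step that builds the quarterly report rep; from describe cr below — rep: {q: int, tot: long} — it was presumably something like:

grunt> grp = group qsales by q;
grunt> rep = foreach grp generate group as q, SUM(qsales.amt) as tot;)
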
grunt> rep2 = foreach rep generate *;
grunt> cr = cross rep, rep2;
grunt> describe cr
cr: {rep::q: int,rep::tot: long,rep2::q: int,rep2::tot: long}
grunt> cr = foreach cr generate 
>>     rep::q as q1, rep2::q as q2,
>>     rep::tot as tot1, rep2::tot as tot2;

grunt> describe cr;
cr: {q1: int,q2: int,tot1: long,tot2: long}
grunt> dump cr;

grunt> fltd = filter cr by ((q1-q2)==1);
grunt> dump fltd;

grunt> res = foreach fltd generate *,
>>        ((tot1-tot2)*100)/tot2 as pgrowth;
grunt> dump res

(3,2,70000,355000,-80)
(2,1,355000,140000,153)
(4,3,290000,70000,314)
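
For example, Q2 vs Q1: ((355000-140000)*100)/140000 = 153 with integer division, i.e. roughly 153% growth; a negative value such as -80 for Q3 vs Q2 means sales fell by 80%.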

_________________________________





Pig Lab 9: Language Neutralization

 ex: Hindi to English.


[training@localhost ~]$ gedit comments
[training@localhost ~]$ gedit dictionary
[training@localhost ~]$ hadoop fs -copyFromLocal comments pdemo
[training@localhost ~]$ hadoop fs -copyFromLocal dictionary pdemo
[training@localhost ~]$ 
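
(The file contents are not shown; from the join output below, the dictionary evidently contains at least: acha,good / bacha,small / lucha,worst.)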

grunt> comms = load 'pdemo/comments'
>>     as (line:chararray);
grunt> dict = load 'pdemo/dictionary'
>>     using PigStorage(',')
>>    as (hindi:chararray, eng:chararray);
grunt> words = foreach comms
>>    generate FLATTEN(TOKENIZE(line)) as word;
grunt> 


grunt> describe words
words: {word: chararray}
grunt> describe dict
dict: {hindi: chararray,eng: chararray}
grunt> j = join words by word left outer, 
>>    dict by hindi;
grunt> jj = foreach j generate words::word as word, dict::hindi as hindi, dict::eng as eng;
grunt> dump jj
(is,,)
(is,,)
(is,,)
(xyz,,)
(acha,acha,good)
(bacha,bacha,small)
(lucha,lucha,worst)
(hadoop,,)
(oracle,,)
____________________

grunt> rset = foreach jj generate 
>>    (hindi is not null ? eng:word) as word;
grunt> dump rset

(is)
(is)
(is)
(xyz)
(good)
(small)
(worst)
(hadoop)
(oracle)

__________________________

Then we can apply sentiment analysis to this result set.

__________________________


Saturday 9 July 2016

Hive Lab 14: Joins and Cartesian Product


 Joins
______________

   Used to collect data from two or more tables.

   Joins are of two types:

 i) Inner joins
 ii) Outer joins

Inner join:
   returns the matching rows for the given join condition.

   ex:
  tab1 ---> name,city
___________
  A,hyd
  B,del
  c,hyd
  d,del
  e,pune
  f,mumbai   

tab2 ---> city, dept
_________
  hyd,production
  del,hr
  pune,accounts
  chennai,admin
_______________

 inner join --> based on city.


 A,hyd,hyd,production
 B,del,del,hr
 c,hyd,hyd,production
 d,del,del,hr
 e,pune,pune,accounts

outer join -->
   returns matching records plus non-matching records.


 Outer joins are of 3 types:
  i) left outer join.
 ii) right outer join
 iii) full outer join.

left outer join:

   matchings + non-matchings of the left side
   (complete presence of the left-side table).
   
 tabx --> id, dno
____________
 101,11
 102,14
___________

 taby ---> dno,loc
 _____________
 11,hyd
 13,del
 _____________


 tabx left outer join taby --> based on dno.


   101,11,11,hyd
   102,14,null,null

 right outer join -->
   matchings and non matchings of right side table.

   (complete presence from right side table ).

 tabx right outer join taby -->based on dno.

   101,11,11,hyd
   null,null,13,del

full outer join --->
  matchings + non-matchings of the left side
  + non-matchings of the right side table.

 tabx full outer join taby ---> based on dno.

   101,11,11,hyd
   102,14,null,null
   null,null,13,del

___________________________________


[training@localhost ~]$ cat emp
101,vino,26000,m,11
102,Sri,25000,f,11
103,mohan,13000,m,13
104,lokitha,8000,f,12
105,naga,6000,m,13
101,janaki,10000,f,12
201,aaa,30000,m,12
202,bbbb,50000,f,13
203,ccc,10000,f,13
204,ddddd,50000,m,13
304,xxx,70000,m,14
305,yyy,80000,f,15
[training@localhost ~]$ cat dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,prod,hyd
21,admin,chennai
[training@localhost ~]$ 


hive> create database joinsdb;
OK
Time taken: 1.761 seconds
hive> use joinsdb;
OK
Time taken: 0.027 seconds
hive> create table emp(id int, name string, 
    >   sal int, sex string, dno int)
    > row format delimited 
    >   fields terminated by ',';
OK
Time taken: 0.297 seconds
hive> load data local inpath 'emp' into table emp;
Copying data from file:/home/training/emp
Copying file: file:/home/training/emp
Loading data to table joinsdb.emp
OK
Time taken: 0.207 seconds
hive> select * from emp;
OK
101     vino    26000   m       11
102     Sri     25000   f       11
103     mohan   13000   m       13
104     lokitha 8000    f       12
105     naga    6000    m       13
101     janaki  10000   f       12
201     aaa     30000   m       12
202     bbbb    50000   f       13
203     ccc     10000   f       13
204     ddddd   50000   m       13
304     xxx     70000   m       14
305     yyy     80000   f       15
Time taken: 0.194 seconds
hive> 

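(The create statement for dept is not shown in the transcript; presumably something like:

hive> create table dept(dno int, dname string, loc string)
    > row format delimited fields terminated by ',';)
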
hive> load data local inpath 'dept'
    >  into table dept;
Copying data from file:/home/training/dept
Copying file: file:/home/training/dept
Loading data to table joinsdb.dept
OK
Time taken: 0.12 seconds
hive> select * from dept;
OK
11      marketing       hyd
12      hr      del
13      finance hyd
20      prod    hyd
21      admin   chennai
Time taken: 0.068 seconds
hive> 

  Inner Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc
    >  from  emp l join dept r
    >  on (l.dno=r.dno);

101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
Time taken: 14.11 seconds
hive> 


Left outer Join:


hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l left outer join dept r
    >  on (l.dno=r.dno);                 



101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
304     xxx     70000   m       14      NULL    NULL    NULL
305     yyy     80000   f       15      NULL    NULL    NULL
Time taken: 12.786 seconds
hive> 


Right outer Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l right outer join dept r
    >  on (l.dno=r.dno);  

101     vino    26000   m       11      11      marketing       hyd
102     Sri     25000   f       11      11      marketing       hyd
104     lokitha 8000    f       12      12      hr      del
101     janaki  10000   f       12      12      hr      del
201     aaa     30000   m       12      12      hr      del
103     mohan   13000   m       13      13      finance hyd
105     naga    6000    m       13      13      finance hyd
202     bbbb    50000   f       13      13      finance hyd
203     ccc     10000   f       13      13      finance hyd
204     ddddd   50000   m       13      13      finance hyd
NULL    NULL    NULL    NULL    NULL    20      prod    hyd
NULL    NULL    NULL    NULL    NULL    21      admin   chennai
Time taken: 12.429 seconds
hive> 

Full outer Join:

hive> select id,name,sal,sex,l.dno,r.dno, 
    >    dname,loc                        
    >  from  emp l full outer join dept r 
    >  on (l.dno=r.dno);   

   

Denormalizing:

hive> create table info(id int, name string, 
    >   sal int, sex string , dname string, 
    >   loc string);
OK
Time taken: 0.04 seconds
hive> insert overwrite table info
    >   select id, name, sal, sex, dname, loc
    >  from emp l full outer join dept r
    >  on (l.dno=r.dno);

           

hive> select * from info;
OK
101     vino    26000   m       marketing       hyd
102     Sri     25000   f       marketing       hyd
104     lokitha 8000    f       hr      del
101     janaki  10000   f       hr      del
201     aaa     30000   m       hr      del
103     mohan   13000   m       finance hyd
105     naga    6000    m       finance hyd
202     bbbb    50000   f       finance hyd
203     ccc     10000   f       finance hyd
204     ddddd   50000   m       finance hyd
304     xxx     70000   m       NULL    NULL
305     yyy     80000   f       NULL    NULL
NULL    NULL    NULL    NULL    prod    hyd
NULL    NULL    NULL    NULL    admin   chennai
Time taken: 0.071 seconds
hive> 

_________________

Task:

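Goal (inferred from the queries below): classify each row of the full outer join of emp and dept as Working (both dnos matched), BenchTeam (an employee whose dno has no department), or BenchProj (a department with no employees), then summarize salaries per status.
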
hive> create table projects(dno1 int, dno2 int,
    >   sal int);
OK
Time taken: 0.072 seconds
hive> insert overwrite table projects
    >   select l.dno, r.dno, sal from 
    >   emp l full outer join dept r 
    >   on (l.dno=r.dno);

hive> create table proj(stat string, sal int);
hive> insert overwrite table proj
    >  select 
    > if(dno1 is not null and dno2 is not null,
    > 'Working',
    >  if(dno2 is null,'BenchTeam','BenchProj')),
    >  sal from projects;

hive> select * from proj;
OK
Working 26000
Working 25000
Working 8000
Working 10000
Working 30000
Working 13000
Working 6000
Working 50000
Working 10000
Working 50000
BenchTeam       70000
BenchTeam       80000
BenchProj       NULL
BenchProj       NULL
Time taken: 0.087 seconds
hive> 

hive> insert overwrite table proj
    >  select stat, if(sal is null,0,sal)
    >  from proj;


hive> create table summary(stat string, 
    >    totsal int, cnt int);
OK
Time taken: 0.042 seconds
hive> insert overwrite table summary
    >   select stat, sum(sal), count(*)
    >  from proj
    >    group by stat;

hive> select * from summary;
OK
BenchProj       0       2
BenchTeam       150000  2
Working 228000  10


___________________________

[training@localhost ~]$ cat trans
01/01/2010,30000
01/07/2010,40000
01/08/2010,30000
01/26/2010,40000
02/01/2010,30000
02/07/2010,40000
02/09/2010,30000
02/28/2010,40000
03/01/2010,80000
03/07/2010,40000
04/01/2010,30000
04/17/2010,70000
04/18/2010,80000
04/26/2010,90000
05/01/2010,10000
05/07/2010,20000
05/08/2010,30000
05/26/2010,60000
06/01/2010,30000
06/17/2010,50000
07/01/2010,30000
07/07/2010,40000
08/08/2010,10000
08/26/2010,20000
09/01/2010,90000
09/07/2010,40000
09/18/2010,30000
09/30/2010,40000
10/01/2010,30000
10/07/2010,40000
10/08/2010,30000
11/26/2010,40000
11/01/2010,30000
12/07/2010,40000
12/08/2010,30000
12/26/2010,40000
[training@localhost ~]$ 

hive> create table rawsales(dt string, 
    >  amt int)
    > row format delimited
    >  fields terminated by ',';
OK
Time taken: 0.057 seconds
hive> load data local inpath 'trans'
    >  into table rawsales;
Copying data from file:/home/training/trans
Copying file: file:/home/training/trans
Loading data to table joinsdb.rawsales
OK
Time taken: 0.514 seconds
hive> 


hive> create table raw2(dt array<string>, amt int);

hive> insert overwrite table raw2
    >   select split(dt,'/'), amt from rawsales;


hive> select * from raw2 limit 3;
OK
["01","01","2010"]      30000
["01","07","2010"]      40000
["01","08","2010"]      30000
Time taken: 0.061 seconds
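(The sales table is not created in the transcript; presumably:

hive> create table sales(dt string, amt int);)
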
hive> insert overwrite table sales
    >   select 
    > concat(dt[2],'-',dt[0],'-',dt[1]), amt
    >  from raw2;

hive> create table sales2(mon int, amt int);
OK
Time taken: 0.044 seconds
hive> insert overwrite table sales2
    >   select month(dt), amt from sales;


hive> create table rep1(mon int, tot int);
OK
Time taken: 0.034 seconds
hive> insert overwrite table rep1
    >   select mon, sum(amt) from sales2
    >    group by mon;



hive> create table carts(m1 int, m2 int, 
    >   tot1 int, tot2 int);             
OK
Time taken: 0.044 seconds
hive> insert overwrite table carts
    >   select l.mon , r.mon , l.tot, r.tot
    >  from rep1 l join rep1 r;


hive> insert overwrite table carts
    >  select * from carts
    >    where (m1-m2)=1;

hive> select * from carts;
OK
12      11      110000  70000
7       6       70000   80000
2       1       140000  140000
8       7       30000   70000
4       3       270000  120000
9       8       200000  30000
10      9       100000  200000
5       4       120000  270000
11      10      70000   100000
3       2       120000  140000
6       5       80000   120000


hive> alter table carts add columns(per int);
OK
Time taken: 0.067 seconds
hive> insert overwrite table carts
    >   select m1,m2,tot1,tot2,
    >     ((tot1-tot2)*100)/tot2 from carts;

hive> select * from carts;
OK
12      11      110000  70000   57
7       6       70000   80000   -12
2       1       140000  140000  0
8       7       30000   70000   -57
4       3       270000  120000  125
9       8       200000  30000   566
10      9       100000  200000  -50
5       4       120000  270000  -55
11      10      70000   100000  -30
3       2       120000  140000  -14
6       5       80000   120000  -33
Time taken: 0.061 seconds
hive> 
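
For example, month 12 vs month 11: ((110000-70000)*100)/70000 = 57 with integer division, i.e. a 57% growth over the previous month.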



Wednesday 6 July 2016

Hive Lab 13 : Eliminating Duplicates and Unions (merging)


[training@localhost ~]$ cat dupes
101,aaa,10000
101,bbb,20000
101,aaa,10000
101,aaa,10000
101,aaa,10000
102,bbb,40000
103,cccc,50000
102,bbb,40000
102,bbb,40000
[training@localhost ~]$ 

hive> create database hdp;
OK
Time taken: 1.702 seconds
hive> use hdp;
OK
Time taken: 0.018 seconds
hive> create table info(id int, name string, 
    >   sal int)
    >  row format delimited fields terminated 
    >  by ',';
OK
Time taken: 0.439 seconds
hive> 

hive> load data local inpath 'dupes'
    >  into table info;
Copying data from file:/home/training/dupes
Copying file: file:/home/training/dupes
Loading data to table hdp.info
OK
Time taken: 0.209 seconds
hive> select * from info;
OK
101     aaa     10000
101     bbb     20000
101     aaa     10000
101     aaa     10000
101     aaa     10000
102     bbb     40000
103     cccc    50000
102     bbb     40000
102     bbb     40000
Time taken: 0.201 seconds
hive> 
hive> select distinct(id),name,sal
    >   from info;


101     aaa     10000
101     bbb     20000
102     bbb     40000
103     cccc    50000

way2)
hive> select id,name,sal           
    >   from info           
    >  group by id,name,sal;

101     aaa     10000
101     bbb     20000
102     bbb     40000
103     cccc    50000

Way3)
 if the table has many columns:

 hive> create table dummy(line string);
 hive> load data inpath
  '/user/hive/warehouse/hdp.db/info/dupes'
  into table dummy;
 hive> insert overwrite table dummy
    select line from dummy group by line;

 hive> load data inpath
  '/user/hive/warehouse/hdp.db/dummy/000000_0'
  into table info;

  hive> drop table dummy;
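
 Note: load data inpath (without local) moves the file within HDFS, so the dupes file leaves the info directory while it is deduplicated through dummy, and the deduplicated result file 000000_0 is then moved back into info.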

________________________________________

 hive> select * from emp;
OK
101     vino    26000   m       11
102     Sri     25000   f       11
103     mohan   13000   m       13
104     lokitha 8000    f       12
105     naga    6000    m       13
101     janaki  10000   f       12
201     aaa     30000   m       12
202     bbbb    50000   f       13
203     ccc     10000   f       13
204     ddddd   50000   m       13
304     xxx     70000   m       14
305     yyy     80000   f       15
Time taken: 0.053 seconds
hive> 


hive> create table todayemp(id int, name string, 
    >  sal int, sex string, dno int) 
    > row format delimited
    >   fields terminated by ',';
OK
Time taken: 0.031 seconds
hive> load data local inpath 'emp2'
    >  into table todayemp;
Copying data from file:/home/training/emp2
Copying file: file:/home/training/emp2
Loading data to table hdp.todayemp
OK
Time taken: 0.095 seconds
hive> select * from todayemp;
OK
401     aaa     50000   m       11
402     bbbbbb  60000   f       12
403     cc      90000   m       11
Time taken: 0.051 seconds
hive> 


Appending one table's data to another table:


way1)
  insert overwrite table emp
    select * from (
      select * from emp
            union all
      select * from todayemp) e;


 -- if the first table has 1 lakh rows and the second has 10, then just to append 10 rows Hive scans 1 lakh + 10 rows.  [ bad ]

 way2)

  if the delimiters of both tables are the same:

  hadoop fs -cp /user/hive/warehouse/hdp.db/todayemp/emp2   /user/hive/warehouse/hdp.db/emp
   
_______________________________________




 union all:

    if the column order of the two tables differs:


  tab1 --> id name  sal   sex dno
 tab2  ---> id name dno sex sal

  select * from (
      select id, name,sal,sex,dno from tab1
           union all
   select id, name,sal,sex,dno from tab2 ) t;


________________

 if the tables have different columns:

 tab1 --> id, name,  sal, sex , dno , desig

 tab2 --> id, name, income, gender, city



  select * from (
    select id, name, sal, sex, dno, desig, 'neevuru' as city from  tab1
           union all
 select id,name,income as sal, gender as sex,
   0 as dno , 'nopani' as desig, city 
    from tab2 ) t;

__________________________________________


Tuesday 5 July 2016

Hive Lab 12 : Loading Array, Struct, Map collection Items

[training@localhost ~]$ cat profile1
101,Venu,btech#mtech,25,hyd
102,Veni,bsc#msc#mtech,26,pune
[training@localhost ~]$ 



hive> create database pract;
OK
Time taken: 1.798 seconds
hive> use pract;
OK
Time taken: 0.027 seconds
hive> create table profile1(
    >   id int, name string, 
    >     qual  array<string>, age int,
    >     city string) 
    > row format delimited        
    >     fields terminated by ','
    >     collection items terminated by '#';

hive> load data local inpath 'profile1'
    >   into table profile1;
Copying data from file:/home/training/profile1
Copying file: file:/home/training/profile1
Loading data to table pract.profile1
OK
Time taken: 0.208 seconds
hive> select * from profile1;
OK
101     Venu    ["btech","mtech"]       25      hyd
102     Veni    ["bsc","msc","mtech"]   26      pune
Time taken: 0.199 seconds
hive> 

__________________
[training@localhost ~]$ cat profile2
101,Venu,Vani#25#btech,hyd
102,Varun,Varuna#24#mba,pune
[training@localhost ~]$ 

hive> create table profile2(
    >  id int, name string, 
    >   wife struct<name:string,age:int,qual:string>, city string)
    >   row format delimited 
    >     fields terminated by ','
    >     collection items terminated by '#';
OK
Time taken: 0.044 seconds
hive> load data local inpath 'profile2'
    >   into table profile2;
Copying data from file:/home/training/profile2
Copying file: file:/home/training/profile2
Loading data to table pract.profile2
OK
Time taken: 0.1 seconds
hive> 
hive> select * from profile2;
OK
101     Venu    {"name":"Vani","age":25,"qual":"btech"}   hyd
102     Varun   {"name":"Varuna","age":24,"qual":"mba"}   pune
Time taken: 0.066 seconds
hive> select name, wife.name from profile2;

Venu    Vani
Varun   Varuna

hive> create table info(id int, name string, 
    >   wname string, wage int, wqual string, 
    >   city string)
    > ;
OK
Time taken: 0.039 seconds
hive> insert overwrite table info
    >   select id, name, wife.name, wife.age,
    >   wife.qual , city  from profile2;

hive> select * from info;
OK
101     Venu    Vani    25      btech   hyd
102     Varun   Varuna  24      mba     pune
Time taken: 0.066 seconds
hive> 

____________________________________
[training@localhost ~]$ cat profile3
101,p1#1000$p3#2000$p7#4000,hyd
102,p1#1200$p2#5000,del
[training@localhost ~]$ 

hive> create table  trans(cid int, 
    >   plist map<string,int>,
    >  city string)
    > row format delimited 
    >    fields terminated by ','
    >    collection items terminated by '$'
    >   map keys terminated by '#';
OK
Time taken: 0.048 seconds
hive> load data local inpath 'profile3'
    >   into table trans;
Copying data from file:/home/training/profile3
Copying file: file:/home/training/profile3
Loading data to table pract.trans
OK
Time taken: 0.103 seconds

hive> select cid, plist['p1'] , city from trans;

101     1000    hyd
102     1200    del


hive> create table sales(cid int, 
    >    prid  array<string>,
    >   pr array<int>);
OK
Time taken: 0.05 seconds
hive> insert overwrite table sales
    >   select cid, map_keys(plist),
    >    map_values(plist)  from trans;


hive> select * from sales;
OK
101     ["p1","p3","p7"]        [1000,2000,4000]
102     ["p1","p2"]     [1200,5000]
Time taken: 0.056 seconds
hive> 
___________________________________









Hive Lab 11: JSON Array Processing

[training@localhost ~]$ cat json3
{"name":"Ravi","qual":["btech","mtech"]}
{"name":"avani","qual":["btech","mtech","mba"]}
{"name":"avinash","qual":["btech","mba"]}
[training@localhost ~]$ 
hive> create database jsons;
OK
Time taken: 1.747 seconds
hive> use jsons;
OK
Time taken: 0.032 seconds
hive> create table raw(line string);
OK
Time taken: 0.382 seconds
hive>  

hive>  load data local inpath 'json3'
    >  into table raw;

hive> select * from raw;  
OK
{"name":"Ravi","qual":["btech","mtech"]}
{"name":"avani","qual":["btech","mtech","mba"]}
{"name":"avinash","qual":["btech","mba"]}
Time taken: 0.233 seconds
hive> 

hive> create table raw2(name string, 
    >   qual  string);
OK
Time taken: 0.047 seconds
hive> insert overwrite table raw2
    >   select x.* from raw
    > lateral view
    >  json_tuple(line,'name','qual') x
    > as n,q;

hive> select * from raw2;
OK
Ravi    ["btech","mtech"]
avani   ["btech","mtech","mba"]
avinash ["btech","mba"]
Time taken: 0.064 seconds
hive> create table raw3(name string, 
    >   qual  array<string>);
OK
Time taken: 0.037 seconds
hive> insert overwrite table raw3
    >   select name, split(qual,',') from raw2;

hive> select * from raw3;
OK
Ravi    ["[\"btech\"","\"mtech\"]"]
avani   ["[\"btech\"","\"mtech\"","\"mba\"]"]
avinash ["[\"btech\"","\"mba\"]"]
Time taken: 0.063 seconds
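(qual in raw2 still holds the raw JSON text, e.g. ["btech","mtech"], so split(qual,',') keeps the bracket and quote characters in each element; the next steps explode the array and use split(qual,'"')[1] to strip them.)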
hive> create table raw4(name string, qual string);
OK

hive> insert overwrite table raw4
    >   select name, myq from raw3
    > lateral view explode(qual) q as myq;

hive> create table info(name string, 
    >    qual string);
OK
Time taken: 0.039 seconds
hive> insert overwrite table info
    >   select name, split(qual,'"')[1]
    >  from raw4;

hive> select * from info;
OK
Ravi    btech
Ravi    mtech
avani   btech
avani   mtech
avani   mba
avinash btech
avinash mba
Time taken: 0.065 seconds
hive>