This repository contains Hive scripts and Docker setup instructions for creating and managing partitioned tables in Hive. The lab demonstrates table creation, data loading, partition management, and executing analytical queries using HiveQL.
docker pull macio232/hadoop-pseudo-distributed-mode
docker run -p 9870:9870 -p 8088:8088 -it --name=myHadoop macio232/hadoop-pseudo-distributed-modedocker container start -i myHadoopmkdir /testFrom your Windows Terminal, open the DDBMS folder and run:
docker cp student_results.csv myHadoop:/test
docker cp students.csv myHadoop:/test
docker cp courses.csv myHadoop:/testls /testhiveCREATE DATABASE education_db;
USE education_db;CREATE TABLE IF NOT EXISTS education_db.student_results (
student_id INT,
subject_code STRING,
marks INT,
grade STRING
)
PARTITIONED BY (
exam_year INT,
exam_session STRING
)
STORED AS PARQUET
LOCATION '/Test/Result';CREATE TABLE IF NOT EXISTS education_db.result_tmp (
student_id INT,
subject_code STRING,
marks INT,
grade STRING,
exam_year INT,
exam_session STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/Test/Result_Temp';LOAD DATA LOCAL INPATH '/test/student_results.csv' INTO TABLE education_db.result_tmp;SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;INSERT OVERWRITE TABLE education_db.student_results PARTITION(exam_year, exam_session)
SELECT * FROM education_db.result_tmp
WHERE exam_year IS NOT NULL AND exam_session IS NOT NULL;SHOW PARTITIONS education_db.student_results;
MSCK REPAIR TABLE education_db.student_results;
SELECT * FROM education_db.student_results;DROP TABLE education_db.result_tmp;ALTER TABLE student_results
ADD PARTITION (exam_year=2020, exam_session='Fall')
LOCATION '/Test/Result/exam_year=2020/exam_session=Fall';
SHOW PARTITIONS student_results;
MSCK REPAIR TABLE student_results;INSERT INTO TABLE education_db.student_results
PARTITION (exam_year=2020, exam_session='Fall')
VALUES
(1071,'CSE101',92,'A+'),
(1072,'CSE102',78,'B+'),
(1073,'CSE103',85,'A'),
(1074,'CSE104',67,'B');
MSCK REPAIR TABLE education_db.student_results;
SELECT * FROM education_db.student_results WHERE exam_year=2020 AND exam_session='Fall';ALTER TABLE education_db.student_results
DROP PARTITION (exam_year=2020, exam_session='Fall');
SHOW PARTITIONS education_db.student_results;
MSCK REPAIR TABLE education_db.student_results;CREATE TABLE IF NOT EXISTS education_db.students (
student_id INT,
student_name STRING,
dob STRING,
department STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/Test/Student';
LOAD DATA LOCAL INPATH '/test/students.csv' INTO TABLE education_db.students;
SELECT * FROM education_db.students;CREATE TABLE IF NOT EXISTS education_db.courses (
course_id INT,
course_name STRING,
credits INT,
department STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/Test/Course';
LOAD DATA LOCAL INPATH '/test/courses.csv' INTO TABLE education_db.courses;
SELECT * FROM education_db.courses;SELECT
sr.student_id,
s.student_name,
s.dob,
s.department,
sr.subject_code,
sr.marks,
sr.grade,
sr.exam_year,
sr.exam_session
FROM education_db.student_results sr
JOIN education_db.students s
ON sr.student_id=s.student_id
WHERE sr.exam_year=2025 AND sr.exam_session='Fall';SELECT
s.student_name,
c.course_name,
sr.marks
FROM education_db.student_results sr
JOIN education_db.students s
ON sr.student_id = s.student_id
JOIN education_db.courses c
ON sr.subject_code = c.course_name
WHERE s.department = 'Computer Science';SELECT
s.department,
AVG(sr.marks) AS avg_marks
FROM education_db.student_results sr
JOIN education_db.students s
ON sr.student_id = s.student_id
WHERE sr.exam_year = 2025
GROUP BY s.department;WITH max_marks_cte AS (
SELECT MAX(marks) AS max_marks
FROM education_db.student_results
WHERE exam_year = 2025 AND exam_session = 'Fall'
)
SELECT
sr.student_id,
s.student_name,
sr.subject_code,
sr.marks
FROM education_db.student_results sr
JOIN max_marks_cte mm
ON sr.marks = mm.max_marks
JOIN education_db.students s
ON sr.student_id = s.student_id
WHERE sr.exam_year = 2025 AND sr.exam_session = 'Fall';DROP TABLE education_db.students;
DROP DATABASE education_db;
SHOW TABLES IN education_db;hdfs dfs -ls /Test/Result/exam_year=2020
hdfs dfs -ls /Test/Result/exam_year=2020/exam_session='Fall'/000000_o