Learn How to Read Hudi Tables on S3 Locally in Your PySpark Environment | Essential Packages You Need¶
Define Imports¶
In [1]:
from pyspark.sql import SparkSession
import os, sys

# Point Spark at a local JDK if needed (this path is a Homebrew OpenJDK 11 install; adjust for your machine)
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11"

# Versions used below to resolve the matching Hudi Spark bundle
HUDI_VERSION = '0.14.0'
SPARK_VERSION = '3.4'
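Before creating the session, it can save a confusing launch failure to confirm that the JAVA_HOME path above actually points at a JDK on your machine. A minimal optional sketch (the Homebrew path is specific to this setup; substitute your own install location):

In [ ]:
# Sanity check (optional): verify a java binary exists under JAVA_HOME.
# The path set above is machine-specific.
from pathlib import Path

java_home = Path(os.environ["JAVA_HOME"])
assert (java_home / "bin" / "java").exists(), f"No java binary found under {java_home}"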
Create Spark Session¶
In [4]:
# --packages pulls the Hudi Spark bundle plus the Hadoop AWS connector and AWS SDK needed for S3 access
SUBMIT_ARGS = f"--packages org.apache.hudi:hudi-spark{SPARK_VERSION}-bundle_2.12:{HUDI_VERSION},org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.773 pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
os.environ['PYSPARK_PYTHON'] = sys.executable

# Spark session
spark = (
    SparkSession.builder
    # Hudi requires Kryo serialization, its SQL extension, and the Hoodie catalog
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension')
    .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.hudi.catalog.HoodieCatalog')
    # S3A read tuning: disable prefetch and use random-access fadvise, which suits columnar file reads
    .config("spark.hadoop.fs.s3a.prefetch.enable", "false")
    .config("spark.hadoop.fs.s3a.experimental.fadvise", "random")
    .config('spark.sql.hive.convertMetastoreParquet', 'false')
    # Credentials come from the ACCESS_KEY / SECRET_KEY environment variables
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("SECRET_KEY"))
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "https://s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)
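Because the session uses DefaultAWSCredentialsProviderChain, credentials already present in the standard AWS environment variables or in ~/.aws/credentials are picked up automatically; the explicit access/secret key lines above matter only if you pass them via ACCESS_KEY/SECRET_KEY. A quick sanity check that the session started and the packages were applied (a minimal sketch using standard Spark APIs):

In [ ]:
# Confirm the session is up and the --packages list from PYSPARK_SUBMIT_ARGS was applied.
print("Spark version:", spark.version)
print("Packages:", spark.sparkContext.getConf().get("spark.jars.packages", "not set"))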
Read Your Hudi Tables¶
In [5]:
# Path to the Hudi table on S3; the s3a:// scheme routes through the S3A filesystem configured above
path = "s3a://soumilshah-dev-1995/tmp/people/"

# Default snapshot query: returns the latest committed state of the table
df = spark.read.format("hudi").load(path)
df.show(truncate=True)
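The load above runs Hudi's default snapshot query, returning the latest state of every record. Hudi also exposes its commit timeline through metadata columns such as _hoodie_commit_time, which enables incremental reads. A sketch, assuming the table at path has at least two commits (the begin instant is taken from the table itself, not hard-coded):

In [ ]:
# Collect the distinct commit instants from the snapshot read above.
commits = sorted(row[0] for row in df.select("_hoodie_commit_time").distinct().collect())

# Incremental query: only records committed strictly after the first instant.
inc_df = spark.read.format("hudi") \
    .option("hoodie.datasource.query.type", "incremental") \
    .option("hoodie.datasource.read.begin.instanttime", commits[0]) \
    .load(path)
inc_df.show(truncate=False)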