Summaries/Apache/Apache Spark/_ jdbc.md

1.3 KiB

title updated created
# jdbc 2022-04-03 15:16:26Z 2021-05-04 14:58:11Z

Method A: load the JDBC driver from Python by setting `PYSPARK_SUBMIT_ARGS`

import os

# Ask PySpark to put the PostgreSQL JDBC driver jar on the classpath.
# This has to run before the first SparkSession/SparkContext is created,
# because the submit arguments are only consulted when the JVM is launched.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars file:/home/john/opt/jars/postgresql-42.2.5.jar pyspark-shell"
)

Method B: load the JDBC driver from the command line when starting `pyspark`

# Start the PySpark shell with the PostgreSQL JDBC driver available:
#   --packages           resolves the driver by its Maven coordinates
#   --driver-class-path  adds the local driver jar to the driver's classpath
pyspark \
    --packages org.postgresql:postgresql:42.2.5 \
    --driver-class-path /home/john/opt/jars/postgresql-42.2.5.jar

Using `--driver-class-path` alone (without `--packages`) is also OK.

from pyspark.sql import DataFrameReader, SparkSession

# Create (or reuse) a local Spark session for the JDBC examples.
# The shuffle partition count is lowered to 4 for this local setup.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jdbc data sources")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

Method 1: set each JDBC option with its own `.option()` call

# Configure a JDBC reader for the PostgreSQL table `public.company`,
# supplying the credentials as separate `user` / `password` options.
company_reader = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://172.17.0.2/postgres")
    .option("dbtable", "public.company")
    .option("user", "postgres")
    .option("password", "qw12aap")
    .option("driver", "org.postgresql.Driver")
)

# Build the DataFrame from the configured reader and print its first rows.
df_company = company_reader.load()
df_company.show()

Method 2: pass all options at once with `.options()`, embedding the credentials in the JDBC URL

# Read the same PostgreSQL table, this time passing every setting through a
# single `.options()` call and carrying the credentials as URL parameters.
#
# The table is schema-qualified in `dbtable`. Spark's JDBC source has no
# `database` option, and `public` is a schema rather than a database, so a
# `database="public"` key would not select the schema; the lookup would only
# succeed when `public` happens to be on the connection's search_path.
dataframe = (
    spark.read.format("jdbc")
    .options(
        url="jdbc:postgresql://172.17.0.2/postgres?user=postgres&password=qw12aap",
        dbtable="public.company",
        driver="org.postgresql.Driver",
    )
    .load()
)
dataframe.show()