Wednesday, October 13, 2021

PySpark - check if directory exists

This goes through the JVM Hadoop FileSystem API via py4j (the private _jvm/_jsc handles on the SparkContext), so it needs no extra Python dependency such as boto3:

def path_exists(path):
    # spark is an existing SparkSession
    sc = spark.sparkContext
    # Get a FileSystem for the path's bucket; path.split("/")[2]
    # pulls the bucket name out of "s3://bucket/key/..."
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jvm.java.net.URI.create("s3://" + path.split("/")[2]),
        sc._jsc.hadoopConfiguration(),
    )
    # fs.exists works for both files and directory prefixes
    return fs.exists(sc._jvm.org.apache.hadoop.fs.Path(path))
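
A quick usage sketch (the bucket and prefix below are just placeholders; any s3:// path works):

path = "s3://my-bucket/data/2021/10/13"
if path_exists(path):
    df = spark.read.parquet(path)
else:
    print(f"{path} does not exist, skipping")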