Describe the bug
When using a generic wildcard (e.g. `path_to_folder/*` or just `path_to_folder`), issues arise once it matches a large number of files.
A workaround is to reduce the number of files matched per load by using more specific wildcards, e.g. one per year: `path_to_folder/D2016*`, `path_to_folder/D2017*`, `path_to_folder/D2018*`, `path_to_folder/D2019*`, `path_to_folder/D2020*` (see the sketch below).
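A minimal sketch of that workaround (assuming the same copybook and data paths as in the reproduction below; the per-year `D<year>*` prefixes come from my file naming): load each narrower glob separately, then union the resulting DataFrames.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().appName("Example").getOrCreate()

// Per-year patterns that keep each load below the problematic file count.
val patterns = Seq("D2016*", "D2017*", "D2018*", "D2019*", "D2020*")

// Load each narrower glob separately, then union the results.
val frames: Seq[DataFrame] = patterns.map { p =>
  spark.read
    .format("za.co.absa.cobrix.spark.cobol.source")
    .option("copybook", "file:///home/jovyan/data/SOURCE/COPYBOOK.txt")
    .load(s"file:///home/jovyan/data/SOURCE/BRAND/initial_transformed/$p")
}
val cobolDataframe: DataFrame = frames.reduce(_ union _)
```

The failure with the generic wildcard looks like this: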
```
Name: java.io.FileNotFoundException
Message: File file:/home/jovyan/data/SOURCE/BRAND/initial_transformed/* does not exist
StackTrace:
  at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:431)
  at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1517)
  at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1557)
  at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:674)
  at za.co.absa.cobrix.spark.cobol.utils.FileUtils$.findAndLogFirstNonDivisibleFile(FileUtils.scala:198)
  at za.co.absa.cobrix.spark.cobol.source.scanners.CobolScanners$.areThereNonDivisibleFiles(CobolScanners.scala:107)
  at za.co.absa.cobrix.spark.cobol.source.scanners.CobolScanners$.buildScanForFixedLength(CobolScanners.scala:87)
  at za.co.absa.cobrix.spark.cobol.source.CobolRelation.buildScan(CobolRelation.scala:90)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:308)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:63)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:63)
  at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
  at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
  at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:78)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:75)
  at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
  at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
  at scala.collection.Iterator$class.foreach(Iterator.scala:891)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
  at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
  at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:75)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:67)
  at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
  at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:78)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:75)
  at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
  at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
  at scala.collection.Iterator$class.foreach(Iterator.scala:891)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
  at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
  at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:75)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:67)
  at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
  at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:73)
  at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:69)
  at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:78)
  at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:78)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
  at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
  at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
```
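Judging by the trace, the exception is raised inside Cobrix's `FileUtils.findAndLogFirstNonDivisibleFile`, which seems to pass the raw pattern to Hadoop's `FileSystem.listStatus`. My assumption (not verified against the Cobrix sources) is that `listStatus` treats the `*` as a literal path component, while `globStatus` is the call that expands wildcards. A minimal illustration of the difference:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val pattern = new Path("file:///home/jovyan/data/SOURCE/BRAND/initial_transformed/*")
val fs: FileSystem = pattern.getFileSystem(new Configuration())

// globStatus expands the wildcard and returns the matching file statuses.
val matched = fs.globStatus(pattern)

// listStatus takes the path literally; a directory named '*' does not
// exist, so this throws the FileNotFoundException seen above.
val listed = fs.listStatus(pattern)
```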
To Reproduce
Steps to reproduce the behaviour OR commands run:
```
%AddDeps za.co.absa.cobrix spark-cobol_2.11 2.0.3 --transitive
```

```scala
import org.apache.spark.sql.SparkSession

val sparkBuilder = SparkSession.builder().appName("Example")
val spark = sparkBuilder.getOrCreate()
```
```scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
spark.udf.register("get_file_name", (path: String) => path.split("/").last)
val cobolDataframe = spark
.read
.format("za.co.absa.cobrix.spark.cobol.source")
.option("pedantic", "true")
.option("copybook", "file:///home/jovyan/data/SOURCE/COPYBOOK.txt")
.load("file:///home/jovyan/data/SOURCE/BRAND/initial_transformed/*")
.withColumn("DPSource", callUDF("get_file_name", input_file_name()))
cobolDataframe
//.filter("RECORD.ID % 2 = 0") // filter the even values of the nested field 'RECORD_LENGTH'
.take(20)
.foreach(v => println(v))
```
Expected behaviour
The data should be loaded and the first 20 rows printed.
Additional context
Thank you for your feedback.