"""Validate a CSV extract in GCS against its destination BigQuery table.

When ``--compare_rows_flag`` is the literal string ``"True"``, the script
reads the CSV at ``--input_gcs_path`` and the BigQuery table identified by
``--dest_bq_project`` / ``--dest_bq_dataset`` / ``--dest_bq_table``, drops the
columns named in ``--bq_drop_col_list`` from the BigQuery side, and writes the
rows present on only one side to ``--bad_rec_gcs_dir`` as CSV.
"""
from pyspark.sql import SparkSession
import argparse

parser = argparse.ArgumentParser(description="args for validation script")
parser.add_argument('--input_gcs_path', type=str)
parser.add_argument('--input_header', type=str)  # True or False
parser.add_argument('--dest_bq_project', type=str)
parser.add_argument('--dest_bq_dataset', type=str)
parser.add_argument('--dest_bq_table', type=str)
parser.add_argument('--dest_bq_filter', type=str)
parser.add_argument('--col_list', type=str)
parser.add_argument('--bq_drop_col_list', type=str)  # e.g. 'ed_ts', 'ed_date'
parser.add_argument('--compare_columns_flag', type=str)
parser.add_argument('--compare_rows_flag', type=str)
parser.add_argument('--bad_rec_gcs_dir', type=str)
args = parser.parse_args()

# Module-level aliases are kept because later parts of the script refer to
# these names directly.
input_gcs_path = args.input_gcs_path
input_header = args.input_header
dest_bq_project = args.dest_bq_project
dest_bq_dataset = args.dest_bq_dataset
dest_bq_table = args.dest_bq_table
dest_bq_filter = args.dest_bq_filter
col_list = args.col_list
bq_drop_col_list = args.bq_drop_col_list
compare_columns_flag = args.compare_columns_flag
compare_rows_flag = args.compare_rows_flag
bad_rec_gcs_dir = args.bad_rec_gcs_dir

# Create a Spark session
spark = SparkSession.builder \
    .appName("data validation") \
    .getOrCreate()

if compare_rows_flag == "True":
    print("comparing rowwise dataframe")
    # NOTE(review): no schema / inferSchema is given, so the CSV columns are
    # presumably all strings; confirm that subtract() against the BigQuery
    # column types behaves as intended.
    df_gcs = spark.read.csv(input_gcs_path, header=input_header)
    df_gcs.show()

    # Read data from BigQuery into a DataFrame
    bq_reader = spark.read \
        .format("bigquery") \
        .option("table", dest_bq_table) \
        .option("project", dest_bq_project) \
        .option("dataset", dest_bq_dataset)
    if dest_bq_filter:
        # Only push a filter down when one was actually supplied.
        bq_reader = bq_reader.option("filter", dest_bq_filter)
    df_bq = bq_reader.load()
    df_bq.show()

    # --bq_drop_col_list arrives as ONE comma-separated string (optionally
    # with quotes around each name). Split it so every column is dropped;
    # passing the raw string to drop() would look for a single column with
    # that whole string as its name and silently drop nothing.
    drop_columns = [
        name.strip().strip("'\"")
        for name in (bq_drop_col_list or "").split(",")
    ]
    drop_columns = [name for name in drop_columns if name]
    df_bq_new = df_bq.drop(*drop_columns)
    df_bq_new.show()

    # Rows that exist on one side only, in each direction.
    diff_df_gcs_bq = df_gcs.subtract(df_bq_new)
    diff_df_bq_gcs = df_bq_new.subtract(df_gcs)

    # Save DataFrame to GCS as CSV
    bad_rec_base = "/".join([bad_rec_gcs_dir, dest_bq_dataset, dest_bq_table])
    diff_df_gcs_bq.write.csv(bad_rec_base + "/diff_df_gcs_bq/", mode="overwrite", header=True)
    diff_df_bq_gcs.write.csv(bad_rec_base + "/diff_df_bq_gcs/", mode="overwrite", header=True)
- if compare_columns_flag=="True":
[text] val
Viewer
*** This page was generated with the meta tag "noindex, nofollow". This happened because you selected this option before saving or the system detected it as spam. This means that this page will never get into the search engines and the search bot will not crawl it. There is nothing to worry about, you can still share it with anyone.
Editor
You can edit this paste and save as new: