Remove Junk Characters from PySpark Data Frame


***How to remove junk characters from the data in a Spark DataFrame without specifying column names***


import re

from pyspark.sql.functions import UserDefinedFunction

from pyspark.sql.types import*


# Sample rows, one single-element list per row. Bound to `rows` rather than
# `input` so the built-in input() is not shadowed for the rest of the session.
rows = [['furqan?*'], ['hello??']]


# NOTE(review): `spark` is not defined in this snippet; presumably it is the
# SparkSession that the PySpark shell or notebook provides -- confirm.
df = spark.createDataFrame(rows, ['col1'])


df.show()

+--------+

|    col1|

+--------+

|furqan?*|

| hello??|

+--------+


def _strip_junk(value):
    """Return ``value`` without its junk characters.

    Every character other than ASCII letters, digits, space, newline and
    '.' is removed. ``None`` is returned unchanged so that null cells stay
    null instead of making ``re.sub`` raise ``TypeError``.
    """
    if value is None:
        return None
    # Raw string: avoids the invalid '\.' escape of a plain literal while
    # matching exactly the same set of characters.
    return re.sub(r'[^a-zA-Z0-9 \n.]', '', value)


# Spark UDF wrapping the cleaner; the result column is a string column.
udf = UserDefinedFunction(_strip_junk, StringType())


# Clean every string column without naming any of them explicitly.
# df.dtypes yields (column name, type name) pairs; columns of any other type
# are passed through untouched because the regex only works on text.
new_df = df.select(*[
    udf(df[name]).alias(name) if type_name == 'string' else df[name]
    for name, type_name in df.dtypes
])


new_df.show()


+------+

|  col1|

+------+

|furqan|

| hello|

+------+


No comments:

Post a Comment

Pages