***How to replace junk characters in a Spark DataFrame without specifying column names***
import re
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import*
# Sample rows whose values carry junk characters ('?', '*') to be stripped.
# Named `rows` rather than `input` so the builtin input() is not shadowed.
rows = [['furqan?*'], ['hello??']]
# NOTE(review): `spark` is not defined in this snippet -- presumably the
# SparkSession provided by the pyspark shell / notebook; confirm before reuse.
df = spark.createDataFrame(rows, ['col1'])
df.show()
+--------+
|    col1|
+--------+
|furqan?*|
| hello??|
+--------+
# Anything that is NOT an ASCII letter, digit, space, newline or dot is junk.
# Raw string so the backslash escape is interpreted by the regex engine
# rather than as a (partly invalid) Python string escape.
_JUNK_PATTERN = r'[^a-zA-Z0-9 \n.]'


def _strip_junk(value):
    """Return ``value`` with junk characters removed.

    ``None`` (a SQL NULL cell) is passed through unchanged, because
    ``re.sub`` raises ``TypeError`` when given ``None``.
    """
    if value is None:
        return None
    return re.sub(_JUNK_PATTERN, '', value)


udf = UserDefinedFunction(_strip_junk, StringType())
# Apply the UDF to every column, keeping each column's name via alias(),
# so no column name has to be written out explicitly.
# NOTE(review): assumes every column holds strings -- confirm for other schemas.
new_df = df.select(*[udf(column).alias(column) for column in df.columns])
new_df.show()
+------+
|  col1|
+------+
|furqan|
| hello|
+------+
No comments:
Post a Comment