如何从 PySpark Dataframe 中的列中获取子字符串?
在本文中,我们将了解如何从 PySpark Dataframe 列中获取子字符串,以及如何创建新列并将子字符串放入新创建的列中。
我们可以使用substring()和substr()函数获取列的子字符串。
Syntax: substring(str,pos,len)
df.col_name.substr(start, length)
Parameter:
- str – It can be string or name of the column from which we are getting the substring.
- start and pos – Through this parameter we can give the starting position from where substring is start.
- length and len – It is the length of the substring from the starting position.
让我们创建一个数据框。
Python3
# importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring
# function to create new SparkSession
def create_session():
spk = SparkSession.builder \
.master("local") \
.appName("Substring.com") \
.getOrCreate()
return spk
def create_df(spark, data, schema):
df1 = spark.createDataFrame(data, schema)
return df1
if __name__ == "__main__":
input_data = [("India", +91, 2701, 2020),
("United States of America", +1, 1301, 2020),
("Israel", +972, 3102, 2020),
("Dubai", +971, 2901, 2020),
("Russia", 7, 3101, 2020)]
# calling function to create SparkSession
spark = create_session()
schema = ["Country", "Country Code",
"Data", "Year"]
# calling function to create dataframe
df = create_df(spark, input_data, schema)
df.show()
Python
if __name__ == "__main__":
# creating Month column and get the
# substring from the Data column
# creating Date column and get the
# substring from the Data column
df = df.withColumn(
"Month", substring("Data", 1, 2)).withColumn(
"Date", substring("Data", 3, 4))
# dropping the Data column from the
# Dataframe
df = df.drop("Data")
# printing Dataframe schema to get the
# column names
df.printSchema()
# visualizing the dataframe
df.show(truncate=False)
Python
if __name__ == "__main__":
# Creating the new column New_Country
# and store the substring using substr()
df = df.withColumn("New_Country", df.Country.substr(0, 12))
# printing Dataframe schema to get the
# column names
df.printSchema()
# visualizing the dataframe
df.show(truncate=False)
Python
if __name__ == "__main__":
input_data = [("India", +91, "AidanButler"),
("United States of America", +1, "ConerFlores"),
("Israel", +972, "RosseBryant"),
("Dubai", +971, "JuliaSimmon"),
("Russia", 7, "AliceBailey")]
# calling function to create SparkSession
spark = create_session()
schema = ["Country", "Country Code", "Name"]
# calling function to create dataframe
df = create_df(spark, input_data, schema)
# Selecting the column using select()
# function and getting substring
# using substring()
df2 = df.select('Name', substring('Name',
1, 5).alias('First Name'),
substring('Name', 6, 6).alias('Last Name'))
# printing Dataframe schema to get the column names
df2.printSchema()
# visualizing the dataframe
df2.show(truncate=False)
Python
if __name__ == "__main__":
input_data = [("India", +91, "AidanButler"),
("United States of America", +1, "ConerFlores"),
("Israel", +972, "RosseBryant"),
("Dubai", +971, "JuliaSimmon"),
("Russia", 7, "AliceBailey")]
# calling function to create SparkSession
spark = create_session()
schema = ["Country", "Country Code", "Name"]
# calling function to create dataframe
df = create_df(spark, input_data, schema)
# Selecting the column using selectExpr()
# function and getting substring using substring()
df2 = df.selectExpr('Name', 'substring(Name, 1,5) as First_Name',
'substring(Name, 6,6) as Last_Name')
# printing Dataframe schema to get the column names
df2.printSchema()
# visualizing the dataframe
df2.show(truncate=False)
输出:
示例 1:使用 substring() 获取子字符串并使用 withColumn()函数创建新列。
Python
if __name__ == "__main__":
# creating Month column and get the
# substring from the Data column
# creating Date column and get the
# substring from the Data column
df = df.withColumn(
"Month", substring("Data", 1, 2)).withColumn(
"Date", substring("Data", 3, 4))
# dropping the Data column from the
# Dataframe
df = df.drop("Data")
# printing Dataframe schema to get the
# column names
df.printSchema()
# visualizing the dataframe
df.show(truncate=False)
输出:
示例 2:通过使用 substr()函数获取子字符串来创建 New_Country 列。
Python
if __name__ == "__main__":
# Creating the new column New_Country
# and store the substring using substr()
df = df.withColumn("New_Country", df.Country.substr(0, 12))
# printing Dataframe schema to get the
# column names
df.printSchema()
# visualizing the dataframe
df.show(truncate=False)
输出:
示例 3:将 substring() 与 select()函数。
Python
if __name__ == "__main__":
input_data = [("India", +91, "AidanButler"),
("United States of America", +1, "ConerFlores"),
("Israel", +972, "RosseBryant"),
("Dubai", +971, "JuliaSimmon"),
("Russia", 7, "AliceBailey")]
# calling function to create SparkSession
spark = create_session()
schema = ["Country", "Country Code", "Name"]
# calling function to create dataframe
df = create_df(spark, input_data, schema)
# Selecting the column using select()
# function and getting substring
# using substring()
df2 = df.select('Name', substring('Name',
1, 5).alias('First Name'),
substring('Name', 6, 6).alias('Last Name'))
# printing Dataframe schema to get the column names
df2.printSchema()
# visualizing the dataframe
df2.show(truncate=False)
输出:
示例 4:将 substring() 与 selectExpr()函数。
Python
if __name__ == "__main__":
input_data = [("India", +91, "AidanButler"),
("United States of America", +1, "ConerFlores"),
("Israel", +972, "RosseBryant"),
("Dubai", +971, "JuliaSimmon"),
("Russia", 7, "AliceBailey")]
# calling function to create SparkSession
spark = create_session()
schema = ["Country", "Country Code", "Name"]
# calling function to create dataframe
df = create_df(spark, input_data, schema)
# Selecting the column using selectExpr()
# function and getting substring using substring()
df2 = df.selectExpr('Name', 'substring(Name, 1,5) as First_Name',
'substring(Name, 6,6) as Last_Name')
# printing Dataframe schema to get the column names
df2.printSchema()
# visualizing the dataframe
df2.show(truncate=False)
输出: