from pyspark.sql import SparkSession
# Initialise the SparkSession (getOrCreate reuses an already-active session).
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)
# Load the data; the first line of the CSV is used as the column names.
data_path = "hdfs://localhost:9000/data"
df = spark.read.option("header", True).csv(data_path)
# Print the first five rows.
df.show(n=5)
# ]]>  (stray CDATA terminator left over from extraction; commented out so the file parses)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Names of the input feature columns and of the label column.
feature_cols = ["feature1", "feature2"]
target_col = "target"

# The CSV is loaded with header=True only (no schema inference), so every
# column arrives as a string. Cast the columns the model needs to double;
# values that cannot be parsed become null.
numeric_cols = feature_cols + [target_col]
train_df = df.select(
    *[df[name].cast("double").alias(name) for name in numeric_cols]
)

# LinearRegression expects `featuresCol` to name ONE vector-typed column, not
# a list of column names, so pack the individual features into a vector first.
# NOTE(review): VectorAssembler's default handleInvalid="error" raises on null
# feature values; clean or filter the data beforehand if that is not desired.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_df = assembler.transform(train_df)

# Create the LinearRegression estimator.
lr = LinearRegression(featuresCol="features", labelCol=target_col)
# Train the model.
model = lr.fit(train_df)
# Print the training summary metric.
summary = model.summary
print("RMSE:", summary.rootMeanSquaredError)
# ]]>  (stray CDATA terminator left over from extraction; commented out so the file parses)