@bfgf52 您好,我在 Jupyter 中运行这个文件中的代码时,
运行到下面这一步报错:`samplesWithUserFeatures = addUserFeatures(samplesWithMovieFeatures)`
报错信息如下,麻烦帮忙看看问题出在哪里,谢谢:
Py4JJavaError: An error occurred while calling o233.withColumn.
: org.apache.spark.sql.AnalysisException: cannot resolve 'reverse(`userPositiveHistory`)' due to data type mismatch: argument 1 requires string type, however, '`userPositiveHistory`' is of array type.;;
'Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, movieGenre3#166, movieRatingCount#233L, movieAvgRating#190, movieRatingStddev#239, reverse(userPositiveHistory#354) AS userPositiveHistory#370]
+- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, movieGenre3#166, movieRatingCount#233L, movieAvgRating#190, movieRatingStddev#239, userPositiveHistory#354]
+- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, movieGenre3#166, movieRatingCount#233L, movieAvgRating#190, movieRatingStddev#239, _w0#355, userPositiveHistory#354, userPositiveHistory#354]
+- Window [collect_list(_w0#355, 0, 0) windowspecdefinition(userId#26, timestamp#29 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -100, -1)) AS userPositiveHistory#354], [userId#26], [timestamp#29 ASC NULLS FIRST]
+- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, movieGenre3#166, movieRatingCount#233L, movieAvgRating#190, movieRatingStddev#239, CASE WHEN (label#88 = 1) THEN movieId#27 ELSE cast(null as string) END AS _w0#355]
+- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, movieGenre3#166, movieRatingCount#233L, movieAvgRating#190, movieRatingStddev#239]
+- Join LeftOuter, (movieId#27 = movieId#245)
:- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, split(genres#12, |)[2] AS movieGenre3#166]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, movieGenre1#147, split(genres#12, |)[1] AS movieGenre2#156]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122, split(genres#12, |)[0] AS movieGenre1#147]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, genres#12, releaseYear#122]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, (title#11) AS title#131, genres#12, releaseYear#122]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, title#11, genres#12, extractReleaseYearUdf(title#11) AS releaseYear#122]
: +- Project [movieId#27, userId#26, rating#28, timestamp#29, label#88, title#11, genres#12]
: +- Join LeftOuter, (movieId#27 = movieId#10)
: :- Project [userId#26, movieId#27, rating#28, timestamp#29, CASE WHEN (cast(rating#28 as double) >= 3.5) THEN 1 ELSE 0 END AS label#88]
: : +- Relation[userId#26,movieId#27,rating#28,timestamp#29] csv
: +- Relation[movieId#10,title#11,genres#12] csv
+- Project [movieId#245, movieRatingCount#233L, movieAvgRating#190, format_number(movieRatingStddev#234, 2) AS movieRatingStddev#239]
+- Project [movieId#245, coalesce(movieRatingCount#188L, cast(0.0 as bigint)) AS movieRatingCount#233L, movieAvgRating#190, coalesce(nanvl(movieRatingStddev#200, cast(null as double)), cast(0.0 as double)) AS movieRatingStddev#234]
+- Aggregate [movieId#245], [movieId#245, count(1) AS movieRatingCount#188L, format_number(avg(cast(rating#246 as double)), 2) AS movieAvgRating#190, stddev_samp(cast(rating#246 as double)) AS movieRatingStddev#200]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, genres#12, releaseYear#122, movieGenre1#147, movieGenre2#156, split(genres#12, |)[2] AS movieGenre3#166]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, genres#12, releaseYear#122, movieGenre1#147, split(genres#12, |)[1] AS movieGenre2#156]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, genres#12, releaseYear#122, split(genres#12, |)[0] AS movieGenre1#147]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, genres#12, releaseYear#122]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, (title#11) AS title#131, genres#12, releaseYear#122]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, title#11, genres#12, extractReleaseYearUdf(title#11) AS releaseYear#122]
+- Project [movieId#245, userId#244, rating#246, timestamp#247, label#88, title#11, genres#12]
+- Join LeftOuter, (movieId#245 = movieId#10)
:- Project [userId#244, movieId#245, rating#246, timestamp#247, CASE WHEN (cast(rating#246 as double) >= 3.5) THEN 1 ELSE 0 END AS label#88]
: +- Relation[userId#244,movieId#245,rating#246,timestamp#247] csv
+- Relation[movieId#10,title#11,genres#12] csv