我想用包含 SparseVector 的数据创建一个 Spark DataFrame,但不知道该如何定义 schema……有人知道怎么做吗?
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.ml.linalg import SparseVector
# Schema attempt from the question. NOTE: this is the statement that fails --
# SparseVector is instantiated here with no arguments, while every other use
# in this snippet passes a size and a dict (SparseVector(135, {...})), hence
# the TypeError about the missing 'size' argument reported right below.
train_schema = T.StructType([T.StructField('features', SparseVector()),\
T.StructField('SALESCLOSEPRICE', T.IntegerType())])
上面的 schema 定义会报错:TypeError: __init__() missing 1 required positional argument: 'size'
# Build a three-row DataFrame from Row objects; each Row carries a sparse
# vector of size 135 in `features` and an integer in `SALESCLOSEPRICE`.
train_df = spark.createDataFrame(
    [
        Row(features=SparseVector(135, {0: 139900.0, 1: 139900.0, 2: 980.0, 3: 10.0, 5: 980.0, 6: 1858.0, 7: 1858.0, 8: 980.0, 9: 1950.0, 10: 1.28, 11: 1.0, 12: 1.0, 15: 2.0, 16: 3.0, 20: 2017.0, 21: 7.0, 22: 28.0, 23: 15.0, 24: 196.0, 25: 25.0, 26: -1.0, 27: 4.03, 28: 3.96, 29: 3.88, 30: 3.9, 31: 3.91, 32: 9.8, 33: 22.4, 34: 67.8, 35: 49.8, 36: 11.9, 37: 2.7, 38: 0.2926, 39: 142.7551, 40: 980.0, 41: 0.0133, 42: 1.5, 43: 1.0, 51: -1.0, 52: -1.0, 53: -1.0, 54: -1.0, 55: -1.0, 56: -1.0, 57: -1.0, 62: 1.0, 68: 1.0, 77: 1.0, 81: 1.0, 89: 1.0, 95: 1.0, 96: 1.0, 101: 1.0, 103: 1.0, 108: 1.0, 114: 1.0, 115: 1.0, 123: 1.0, 133: 1.0}), SALESCLOSEPRICE=143000),
        Row(features=SparseVector(135, {0: 210000.0, 1: 210000.0, 2: 1144.0, 3: 4.0, 5: 1268.0, 6: 1640.0, 7: 1640.0, 8: 2228.0, 9: 1971.0, 10: 0.32, 11: 1.0, 14: 2.0, 15: 3.0, 16: 4.0, 17: 960.0, 20: 2017.0, 21: 10.0, 22: 41.0, 23: 9.0, 24: 282.0, 25: 2.0, 26: -1.0, 27: 3.91, 28: 3.85, 29: 3.83, 30: 3.83, 31: 3.78, 32: 32.2, 33: 49.0, 34: 18.8, 35: 14.0, 36: 35.8, 37: 14.6, 38: 0.4392, 39: 94.2549, 40: 2228.0, 41: 0.0078, 42: 1.3333, 43: -1.0, 44: -1.0, 45: -1.0, 46: -1.0, 47: -1.0, 48: -1.0, 49: -1.0, 50: -1.0, 52: 1.0, 55: -1.0, 56: -1.0, 57: -1.0, 62: 1.0, 68: 1.0, 77: 1.0, 79: 1.0, 89: 1.0, 92: 1.0, 96: 1.0, 101: 1.0, 103: 1.0, 108: 1.0, 114: 1.0, 115: 1.0, 124: 1.0, 133: 1.0}), SALESCLOSEPRICE=190000),
        Row(features=SparseVector(135, {0: 225000.0, 1: 225000.0, 2: 1102.0, 3: 28.0, 5: 1102.0, 6: 2390.0, 7: 2390.0, 8: 1102.0, 9: 1949.0, 10: 0.822, 11: 1.0, 15: 1.0, 16: 2.0, 20: 2017.0, 21: 6.0, 22: 26.0, 23: 26.0, 24: 177.0, 25: 25.0, 26: -1.0, 27: 3.88, 28: 3.9, 29: 3.91, 30: 3.89, 31: 3.94, 32: 9.8, 33: 22.4, 34: 67.8, 35: 61.7, 36: 2.7, 38: 0.4706, 39: 204.1742, 40: 1102.0, 41: 0.0106, 42: 2.0, 49: 1.0, 51: -1.0, 52: -1.0, 53: -1.0, 54: -1.0, 57: 1.0, 62: 1.0, 68: 1.0, 70: 1.0, 79: 1.0, 89: 1.0, 92: 1.0, 96: 1.0, 100: 1.0, 103: 1.0, 108: 1.0, 110: 1.0, 115: 1.0, 123: 1.0, 131: 1.0, 132: 1.0}), SALESCLOSEPRICE=225000),
    ],
    schema=train_schema,
)
最佳答案
您可以在 schema 中使用 VectorUDT 来声明稀疏向量列。但如果把 VectorUDT 字段放在 StructType 字段列表的第一位,就会出错:用关键字参数创建 Row 对象时,字段会按字段名的字母顺序排序(官方文档中也提到了这一点),因此 Row 中字段的实际顺序与 schema 中声明的顺序不一致。
# Only the first two vector entries are shown; the remaining entries are
# omitted for brevity (a literal `...` inside the dict is not valid Python).
r = Row(features=SparseVector(135, {0: 139900.0, 1: 139900.0}), SALESCLOSEPRICE=143000)
# Evaluate in a REPL/notebook to display the order in which the Row stores its fields.
r.__fields__
输出:
['SALESCLOSEPRICE', 'features']
完整代码:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.ml.linalg import VectorUDT, SparseVector
# Declare the sparse-vector column with VectorUDT.
#
# Why the field order matters when the data is given as keyword-argument Rows:
# Row(features=..., SALESCLOSEPRICE=...) reports its fields as
# ['SALESCLOSEPRICE', 'features'] (alphabetical order), so a schema that lists
# 'features' first does not work:
#
#     train_schema = T.StructType([
#         T.StructField('features', VectorUDT()),
#         T.StructField('SALESCLOSEPRICE', T.IntegerType())
#     ])
#
# Raises: TypeError: cannot serialize 143000 of type <class 'int'>
#
# NOTE: the alphabetical sorting is Spark 2.x behaviour; per the PySpark
# migration guide, Spark 3.0+ keeps keyword-argument Row fields in the order
# they were written, so code relying on the sorted order may break there.
# Passing plain tuples whose values follow the schema order avoids depending
# on either behaviour.
train_schema = T.StructType([
    T.StructField('SALESCLOSEPRICE', T.IntegerType()),
    T.StructField('features', VectorUDT()),
])

# Each tuple is (SALESCLOSEPRICE, features), i.e. exactly the schema order above.
train_df = spark.createDataFrame(
    [
        (143000, SparseVector(135, {0: 139900.0, 1: 139900.0, 2: 980.0, 3: 10.0, 5: 980.0, 6: 1858.0, 7: 1858.0, 8: 980.0, 9: 1950.0, 10: 1.28, 11: 1.0, 12: 1.0, 15: 2.0, 16: 3.0, 20: 2017.0, 21: 7.0, 22: 28.0, 23: 15.0, 24: 196.0, 25: 25.0, 26: -1.0, 27: 4.03, 28: 3.96, 29: 3.88, 30: 3.9, 31: 3.91, 32: 9.8, 33: 22.4, 34: 67.8, 35: 49.8, 36: 11.9, 37: 2.7, 38: 0.2926, 39: 142.7551, 40: 980.0, 41: 0.0133, 42: 1.5, 43: 1.0, 51: -1.0, 52: -1.0, 53: -1.0, 54: -1.0, 55: -1.0, 56: -1.0, 57: -1.0, 62: 1.0, 68: 1.0, 77: 1.0, 81: 1.0, 89: 1.0, 95: 1.0, 96: 1.0, 101: 1.0, 103: 1.0, 108: 1.0, 114: 1.0, 115: 1.0, 123: 1.0, 133: 1.0})),
        (190000, SparseVector(135, {0: 210000.0, 1: 210000.0, 2: 1144.0, 3: 4.0, 5: 1268.0, 6: 1640.0, 7: 1640.0, 8: 2228.0, 9: 1971.0, 10: 0.32, 11: 1.0, 14: 2.0, 15: 3.0, 16: 4.0, 17: 960.0, 20: 2017.0, 21: 10.0, 22: 41.0, 23: 9.0, 24: 282.0, 25: 2.0, 26: -1.0, 27: 3.91, 28: 3.85, 29: 3.83, 30: 3.83, 31: 3.78, 32: 32.2, 33: 49.0, 34: 18.8, 35: 14.0, 36: 35.8, 37: 14.6, 38: 0.4392, 39: 94.2549, 40: 2228.0, 41: 0.0078, 42: 1.3333, 43: -1.0, 44: -1.0, 45: -1.0, 46: -1.0, 47: -1.0, 48: -1.0, 49: -1.0, 50: -1.0, 52: 1.0, 55: -1.0, 56: -1.0, 57: -1.0, 62: 1.0, 68: 1.0, 77: 1.0, 79: 1.0, 89: 1.0, 92: 1.0, 96: 1.0, 101: 1.0, 103: 1.0, 108: 1.0, 114: 1.0, 115: 1.0, 124: 1.0, 133: 1.0})),
        (225000, SparseVector(135, {0: 225000.0, 1: 225000.0, 2: 1102.0, 3: 28.0, 5: 1102.0, 6: 2390.0, 7: 2390.0, 8: 1102.0, 9: 1949.0, 10: 0.822, 11: 1.0, 15: 1.0, 16: 2.0, 20: 2017.0, 21: 6.0, 22: 26.0, 23: 26.0, 24: 177.0, 25: 25.0, 26: -1.0, 27: 3.88, 28: 3.9, 29: 3.91, 30: 3.89, 31: 3.94, 32: 9.8, 33: 22.4, 34: 67.8, 35: 61.7, 36: 2.7, 38: 0.4706, 39: 204.1742, 40: 1102.0, 41: 0.0106, 42: 2.0, 49: 1.0, 51: -1.0, 52: -1.0, 53: -1.0, 54: -1.0, 57: 1.0, 62: 1.0, 68: 1.0, 70: 1.0, 79: 1.0, 89: 1.0, 92: 1.0, 96: 1.0, 100: 1.0, 103: 1.0, 108: 1.0, 110: 1.0, 115: 1.0, 123: 1.0, 131: 1.0, 132: 1.0})),
    ],
    schema=train_schema,
)
train_df.show()
输出:
+---------------+--------------------+
|SALESCLOSEPRICE| features|
+---------------+--------------------+
| 143000|(135,[0,1,2,3,5,6...|
| 190000|(135,[0,1,2,3,5,6...|
| 225000|(135,[0,1,2,3,5,6...|
+---------------+--------------------+
关于 python - 如何构造包含 SparseVector 特征列的 schema?我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/56411124/