Error message:

RuntimeError: ('Exception thrown when converting pandas.Series (object) to Arrow Array (None). It can be caused by overflows or other unsafe conversions warned by Arrow. Arrow safe type check can be disabled by using SQL config `spark.sql.execution.pandas.arrowSafeTypeConversion`.', ArrowInvalid('Could not convert DenseVector([1.6486, -4.0133, -1.0091]) with type DenseVector: did not recognize Python value type when inferring an Arrow data type'))
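
The inner ArrowInvalid can be reproduced with pyarrow alone, independent of Spark. A minimal sketch, assuming pyspark and pyarrow are installed:

import pyarrow as pa
from pyspark.ml.linalg import Vectors

# pyarrow's type inference does not recognize DenseVector, so this raises
# pyarrow.lib.ArrowInvalid with the same "did not recognize Python value
# type when inferring an Arrow data type" message quoted above
pa.array([Vectors.dense([1.0, 2.0])])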

The full script below, run as a Byzer Python cell (the #% lines are Byzer directives declaring the input, the output schema, and the Python environment), reproduces the error end to end.

#%python
#%input=command
#%output=python_output_table
#%schema=st(field(_id,string),field(x,array(double)))
#%runIn=driver
#%dataMode=model
#%cache=false
#%env=source /home/byzerllm/miniconda3/bin/activate  byzerllm-dev
#%pythonExec=/home/byzerllm/miniconda3/envs/byzerllm-dev/bin/python

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Create a local Spark session for the reproduction
sc = SparkContext('local')
spark = SparkSession(sc)

from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# Three 5-dimensional feature vectors, one sparse and two dense
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data, ["features"])

# Project the 5-dimensional features onto the top 3 principal components
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

# Each collected row wraps a pyspark.ml.linalg.DenseVector, e.g. the
# DenseVector([1.6486, -4.0133, -1.0091]) quoted in the error message above
rows = model.transform(df).select("pcaFeatures").collect()
for row in rows:
    print(row)


id_count = 1


def handle_record(row):
    global id_count
    item = {"_id": str(id_count)}
    id_count += 1
    # row[0] is a pyspark.ml.linalg.DenseVector, not a plain list of doubles;
    # storing it here is what later triggers the Arrow conversion error
    item["x"] = row[0]
    print("row0: %s" % row[0])
    return item


result = [handle_record(row) for row in rows]


# context is injected by the Byzer Python runtime; build_result serializes the
# records to the declared schema via pandas/Arrow, raising the error above
context.build_result(result)
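
Root cause: the output schema declares x as array(double), but handle_record stores a pyspark.ml.linalg.DenseVector in x. When context.build_result pushes the records through pandas to Arrow, Arrow cannot infer a type for DenseVector and raises ArrowInvalid. The spark.sql.execution.pandas.arrowSafeTypeConversion config mentioned in the message only relaxes Arrow's safe-cast checks (the overflow warnings the message refers to); it cannot help when Arrow fails to infer a type at all.

The fix is to convert the vector to a plain Python list of floats before returning it. A minimal sketch of the corrected handler:

def handle_record(row):
    global id_count
    item = {"_id": str(id_count)}
    id_count += 1
    # DenseVector.toArray() returns a numpy array; tolist() yields the plain
    # list of Python floats that matches the declared array(double) field
    item["x"] = row[0].toArray().tolist()
    return item

With this change each record carries plain data such as {"_id": "1", "x": [1.6486, -4.0133, -1.0091]}, which Arrow can map onto st(field(_id,string),field(x,array(double))) without a type inference failure.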
