Run PySpark on Google Colab

Using PySpark in Google Colab (https://colab.research.google.com/).

''' 1. install Java, download and unpack Spark, install findspark '''

# install a headless JDK 8, fetch Spark 2.4.0, unpack it, and install findspark
# (tar is preinstalled on Colab, so no separate install is needed)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
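
A quick sanity check that step 1 succeeded (a minimal sketch; the directory name matches the archive downloaded above):

# the JDK should report version 1.8, and the Spark directory should exist
!java -version
!ls /content/spark-2.4.0-bin-hadoop2.7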


''' 2. point JAVA_HOME and SPARK_HOME at the new installs '''

import os

# tell findspark/PySpark where Java and Spark live
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"


''' 3. initialize findspark and create a SparkSession '''

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

spark = SparkSession.builder.master("local[*]").getOrCreate()
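
A one-line sanity check that the session is live:

# should print 2.4.0 if the steps above ran cleanly
print(spark.version)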



''' 4. test on Colab's bundled sample data to check it is working '''

df = spark.read.csv("./sample_data/california_housing_train.csv", inferSchema=True, header=True)
print(type(df))
df.printSchema()
df.show()
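
A follow-up query on the same DataFrame makes a slightly stronger smoke test, since it actually runs a shuffle; the column names used here are the ones Colab ships in california_housing_train.csv:

# a simple filter + aggregate to exercise the executor
df.filter(df.median_house_value > 400000) \
  .groupBy("housing_median_age") \
  .count() \
  .orderBy("housing_median_age") \
  .show(5)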

nes_Sch = StructType([
    StructField("Name", StructType([
        StructField("f_name", StringType(), True),
        StructField("l_name", StringType(), True),
    ])),
    StructField("ID", IntegerType(), True),
    StructField("Add", StringType(), True),
])
data1 = [(("John", "cena"), 123, "UK"), (("Singh", "dd"), 23, "IND")]
b = spark.createDataFrame(data1, nes_Sch)
b.show()
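
With a nested schema like this, individual sub-fields can be pulled out with dotted column paths (standard PySpark syntax, shown here against the toy data above):

# flatten the nested Name struct into top-level columns
b.select("Name.f_name", "Name.l_name", "ID").show()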


''' 5. shop table: explicit schema, CSV in, Parquet out '''
schema = StructType([StructField("id", LongType(), True), StructField("raw_id", StringType(), True)])

df = spark.read.option("header", True).schema(schema).csv("./sample_data/shop.csv")
df.write.parquet("shop.parquet")
df.show()
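
Reading the Parquet output back in the same session is an easy way to confirm the round trip (same path as written above):

# re-read the Parquet directory and confirm the schema survived
df2 = spark.read.parquet("shop.parquet")
df2.printSchema()
df2.show()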
