[Addon kubevela#603] Add Apache Spark as a experimental addon
Signed-off-by: yanghua <[email protected]>
yanghua committed Feb 27, 2023
1 parent 285efc3 commit 843319f
Showing 5 changed files with 396 additions and 0 deletions.
170 changes: 170 additions & 0 deletions experimental/addons/spark-kubernetes-operator/README.md
@@ -0,0 +1,170 @@
# spark-kubernetes-operator

A Kubernetes operator for Apache Spark (https://github.com/GoogleCloudPlatform/spark-on-k8s-operator). It allows users to manage Spark applications and their lifecycle through native Kubernetes tooling like `kubectl`.

> Note: It is not provided by the Apache Spark project, but it is widely used by a large number of companies (https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/who-is-using.md).

# Install

```
# The following steps enable fluxcd and spark-kubernetes-operator in a namespace called "spark-operator".
vela addon enable fluxcd
vela addon enable spark-kubernetes-operator
```
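
The addon also exposes a few parameters (see `parameter.cue` in this commit), such as the install namespace, whether to create the webhook, and the operator image repository and tag. As a rough sketch, assuming the standard `vela addon enable <addon> key=value` syntax, you can override the defaults like this (the values below are only illustrative):

```
# Enable the addon with custom parameters instead of the defaults.
# "namespace", "createWebhook" and "imageTag" are the parameters
# declared in parameter.cue; the values shown are examples only.
vela addon enable spark-kubernetes-operator namespace=spark-operator createWebhook=true imageTag=v1beta2-1.3.8-3.1.1
```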

# Uninstall

```
vela addon disable spark-kubernetes-operator
vela addon disable fluxcd
```

# To check the spark-kubernetes-operator running status

* Firstly, check the running status of spark-kubernetes-operator (and fluxcd, which it depends on and which is deployed via Helm):

```
vela addon status spark-kubernetes-operator
vela ls -A | grep spark
```
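
You can also verify the operator workloads directly with `kubectl`. A minimal sketch, assuming the addon was installed into the default "spark-operator" namespace:

```
# Check that the operator deployment and its pods are running.
# Adjust the namespace if you enabled the addon with a different one.
kubectl get deployments,pods -n spark-operator
```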

* Secondly, show the component type `spark-application`, so we know how to use it in an application. As a Spark user, you can choose which parameters to set for your Spark application:

```
vela show spark-application
# Specification
+---------------------+------------------------------------------------------------------------------------------------------+-------------------+----------+---------+
| NAME | DESCRIPTION | TYPE | REQUIRED | DEFAULT |
+---------------------+------------------------------------------------------------------------------------------------------+-------------------+----------+---------+
| name | Specify the spark application name. | string | true | |
| namespace | Specify the namespace for spark application to install. | string | true | |
| type | Specify the application language type, e.g. "Scala", "Python", "Java" or "R". | string | true | |
| pythonVersion | Specify the python version. | string | false | |
| mode | Specify the deploy mode, e.g. "cluster", "client" or "in-cluster-client". | string | true | |
| image | Specify the container image for the driver, executor, and init-container. | string | true | |
| imagePullPolicy | Specify the image pull policy for the driver, executor, and init-container. | string | true | |
| mainClass | Specify the fully-qualified main class of the Spark application. | string | true | |
| mainApplicationFile | Specify the path to a bundled JAR, Python, or R file of the application. | string | true | |
| sparkVersion | Specify the version of Spark the application uses. | string | true | |
| driverCores | Specify the number of CPU cores to request for the driver pod. | int | true | |
| executorCores | Specify the number of CPU cores to request for the executor pod. | int | true | |
| arguments | Specify a list of arguments to be passed to the application. | []string | false | |
| sparkConf | Specify the config information carries user-specified Spark configuration properties as they would | map[string]string | false | |
| | use the "--conf" option in spark-submit. | | | |
| hadoopConf | Specify the config information carries user-specified Hadoop configuration properties as they would | map[string]string | false | |
| | use the "--conf" option in spark-submit. The SparkApplication controller automatically adds | | | |
| | prefix "spark.hadoop." to Hadoop configuration properties. | | | |
| sparkConfigMap | Specify the name of the ConfigMap containing Spark configuration files such as log4j.properties. The | string | false | |
| | controller will add environment variable SPARK_CONF_DIR to the path where the ConfigMap is mounted | | | |
| | to. | | | |
| hadoopConfigMap | Specify the name of the ConfigMap containing Hadoop configuration files such as core-site.xml. The | string | false | |
| | controller will add environment variable HADOOP_CONF_DIR to the path where the ConfigMap is mounted | | | |
| | to. | | | |
+---------------------+------------------------------------------------------------------------------------------------------+-------------------+----------+---------+
```

# Example of how to run a component of type spark-application in an application

1. Firstly, copy the following example to "spark-app-v1.yaml":

> The addon will create a namespace named "spark-cluster"
```
apiVersion: core.oam.dev/v1beta1
kind: Application
metadata:
name: spark-app-v1
namespace: spark-cluster
spec:
components:
- name: my-spark-application-component
type: spark-application
properties:
name: my-spark-app
namespace: spark-cluster
type: Scala
mode: cluster
image: "gcr.io/spark-operator/spark:v3.1.1"
imagePullPolicy: Always
mainClass: org.apache.spark.examples.streaming.JavaQueueStream
mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar"
sparkVersion: "3.1.1"
driverCores: 1
executorCores: 1
```
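
Optionally, before deploying, you can render the application locally to inspect the `SparkApplication` manifest that the component will generate. A hedged sketch using vela's `dry-run` subcommand:

```
# Render the application without applying it to the cluster.
vela dry-run -f spark-app-v1.yaml
```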

2. Secondly, deploy the application:

```
vela up -f spark-app-v1.yaml
```

You will see output like this:

```
Applying an application in vela K8s object format...
I0227 16:54:37.069480 361176 apply.go:121] "creating object" name="spark-app-v1" resource="core.oam.dev/v1beta1, Kind=Application"
✅ App has been deployed 🚀🚀🚀
Port forward: vela port-forward spark-app-v1 -n spark-cluster
SSH: vela exec spark-app-v1 -n spark-cluster
Logging: vela logs spark-app-v1 -n spark-cluster
App status: vela status spark-app-v1 -n spark-cluster
Endpoint: vela status spark-app-v1 -n spark-cluster --endpoint
Application spark-cluster/spark-app-v1 applied.
```

3. Then, you can use native Kubernetes commands to check the status of the Spark application:

```
$ kubectl get sparkapplications -n spark-cluster
NAME STATUS ATTEMPTS START FINISH AGE
my-spark-app RUNNING 1 2023-02-27T08:54:40Z <no value> 2m33s
```

or get the application detail via this command:

```
$ kubectl describe sparkapplication my-spark-app -n spark-cluster
Name: my-spark-app
Namespace: spark-cluster
Labels: app.oam.dev/app-revision-hash=4e5592aea53a5961
app.oam.dev/appRevision=spark-app-v1-v1
app.oam.dev/cluster=local
app.oam.dev/component=my-spark-application-component
app.oam.dev/name=spark-app-v1
app.oam.dev/namespace=spark-cluster
app.oam.dev/resourceType=TRAIT
app.oam.dev/revision=
oam.dev/render-hash=640a3298d803274e
trait.oam.dev/resource=spark
trait.oam.dev/type=AuxiliaryWorkload
Annotations: app.oam.dev/last-applied-configuration:
{"apiVersion":"sparkoperator.k8s.io/v1beta2","kind":"SparkApplication","metadata":{"annotations":{"app.oam.dev/last-applied-time":"2023-02...
app.oam.dev/last-applied-time: 2023-02-27T16:54:37+08:00
oam.dev/kubevela-version: v1.7.0
API Version: sparkoperator.k8s.io/v1beta2
Kind: SparkApplication
Metadata:
......
```

or get the details of the KubeVela application itself via this command:

```
$ kubectl get app spark-app-v1 -n spark-cluster -oyaml
apiVersion: core.oam.dev/v1beta1
kind: Application
metadata:
......
```
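
If the application does not reach the `RUNNING` state, it can help to inspect the pods and events behind it. A rough troubleshooting sketch (the label selectors assume the labels commonly set by Spark and the operator, such as `spark-role` and `sparkoperator.k8s.io/app-name`):

```
# List the pods created for the example application.
kubectl get pods -n spark-cluster -l sparkoperator.k8s.io/app-name=my-spark-app

# Tail the driver logs (the "spark-role=driver" label is set on the driver pod).
kubectl logs -f -n spark-cluster -l spark-role=driver

# Show recent events in the namespace, sorted by time.
kubectl get events -n spark-cluster --sort-by=.lastTimestamp
```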

4. Show the services of the Spark application via this command:

```
$ kubectl get svc -n spark-cluster
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
my-spark-app-c58a1c869214bfe5-driver-svc ClusterIP None <none> 7078/TCP,7079/TCP,4040/TCP 19m
my-spark-app-ui-svc ClusterIP xx.xx.xx.xx <none> 4040/TCP 19m
```
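
The `my-spark-app-ui-svc` service exposes the Spark web UI on port 4040. As a quick sketch, you can reach it from your local machine with a port-forward and then open http://localhost:4040:

```
# Forward the Spark UI service to localhost.
kubectl port-forward svc/my-spark-app-ui-svc 4040:4040 -n spark-cluster
```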
@@ -0,0 +1,90 @@
"spark-application": {
annotations: {}
attributes: workload: type: "autodetects.core.oam.dev"
description: "Describes a containerized spark application that can specify resource spec."
labels: {}
type: "component"
}

template: {
parameter: {
// +usage=Specify the spark application name
name: string
// +usage=Specify the namespace for spark application to install
namespace: string
// +usage=Specify the application language type, e.g. "Scala", "Python", "Java" or "R"
type: string
// +usage=Specify the python version
pythonVersion?: string
// +usage=Specify the deploy mode, e.g. "cluster", "client" or "in-cluster-client"
mode: string
// +usage=Specify the container image for the driver, executor, and init-container
image: string
// +usage=Specify the image pull policy for the driver, executor, and init-container
imagePullPolicy: string
// +usage=Specify the fully-qualified main class of the Spark application
mainClass: string
// +usage=Specify the path to a bundled JAR, Python, or R file of the application
mainApplicationFile: string
// +usage=Specify the version of Spark the application uses
sparkVersion: string
// +usage=Specify the number of CPU cores to request for the driver pod
driverCores: int
// +usage=Specify the number of CPU cores to request for the executor pod
executorCores: int
// +usage=Specify a list of arguments to be passed to the application
arguments?: [...string]
// +usage=Specify the config information carries user-specified Spark configuration properties as they would use the "--conf" option in spark-submit
sparkConf?: [string]: string
// +usage=Specify the config information carries user-specified Hadoop configuration properties as they would use the "--conf" option in spark-submit. The SparkApplication controller automatically adds prefix "spark.hadoop." to Hadoop configuration properties
hadoopConf?: [string]: string
// +usage=Specify the name of the ConfigMap containing Spark configuration files such as log4j.properties. The controller will add environment variable SPARK_CONF_DIR to the path where the ConfigMap is mounted to
sparkConfigMap?: string
// +usage=Specify the name of the ConfigMap containing Hadoop configuration files such as core-site.xml. The controller will add environment variable HADOOP_CONF_DIR to the path where the ConfigMap is mounted to
hadoopConfigMap?: string

}

output: {
kind: "ClusterRoleBinding"
apiVersion: "rbac.authorization.k8s.io/v1"
metadata: name: parameter.name
roleRef: {
name: "edit"
apiGroup: "rbac.authorization.k8s.io"
kind: "ClusterRole"
}
subjects: [{
name: "default"
kind: "ServiceAccount"
namespace: parameter.namespace
}]
}

outputs: {

"spark": {
kind: "SparkApplication"
apiVersion: "sparkoperator.k8s.io/v1beta2"
metadata: {
name: parameter.name
namespace: parameter.namespace
}
spec: {
type: parameter.type
mode: parameter.mode
image: parameter.image
imagePullPolicy: parameter.imagePullPolicy
mainClass: parameter.mainClass
mainApplicationFile: parameter.mainApplicationFile
sparkVersion: parameter.sparkVersion
driver: {
cores: parameter.driverCores
}
executor: {
cores: parameter.executorCores
}
}
}
}
}
15 changes: 15 additions & 0 deletions experimental/addons/spark-kubernetes-operator/metadata.yaml
@@ -0,0 +1,15 @@
description: A kubernetes operator for Apache Spark
icon: "https://spark.apache.org/images/spark-logo.png"
invisible: false
name: spark-kubernetes-operator
tags:
- GoogleCloudPlatform/spark-on-k8s-operator
version: v1beta2-1.3.8-3.1.1
url: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator

dependencies:
- name: fluxcd

system:
vela: ">=1.5.0-beta.3"
kubernetes: ">=1.16"
20 changes: 20 additions & 0 deletions experimental/addons/spark-kubernetes-operator/parameter.cue
@@ -0,0 +1,20 @@
// parameter.cue is used to store addon parameters.
//
// You can use these parameters in template.cue or in resources/ by 'parameter.myparam'
//
// For example, you can use parameters to allow the user to customize
// container images, ports, and etc.
parameter: {
// +usage=Deploy to specified clusters. Leave empty to deploy to all clusters.
clusters?: [...string]
// +usage=Namespace to deploy to
namespace: *"spark-operator" | string
// +usage=Specify whether to create the webhook
"createWebhook": *false | bool
// +usage=Specify the image repository
"imageRepository": *"ghcr.io/googlecloudplatform/spark-operator" | string
// +usage=Specify the image tag
"imageTag": *"v1beta2-1.3.8-3.1.1" | string
// +usage=Specify whether to create the service account for Spark jobs
"createSparkServiceAccount": *false | bool
}