Spark: Difference between revisions

From Chorke Wiki
Jump to navigation Jump to search
 
(29 intermediate revisions by the same user not shown)
Line 44: Line 44:
  sudo systemctl start  spark-slave.service
  sudo systemctl start  spark-slave.service
  sudo systemctl status spark-slave.service
  sudo systemctl status spark-slave.service
== Knowledge ==
{|
|valign="top" colspan="3"|
ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "spark@${HOSTNAME}"
readlink -f /usr/bin/java | sed "s:bin/java::"
sudo apt-get install pdsh
sudo apt-get install ssh
|-
|colspan="3"|
----
|-
|valign="top" colspan="3"|
su - spark
sudo -u spark -H sh -c 'whoami; echo ${HOME}'
sh $SPARK_HOME/bin/spark-shell
sh $SPARK_HOME/bin/pyspark
http://127.0.0.1:8080
http://127.0.0.1:8088
http://127.0.0.1:9870
http://127.0.0.1:4040
|-
|colspan="3"|
----
|-
|valign="bottom" colspan="2"|
sudo apt dist-upgrade
sudo do-release-upgrade
sudo apt --fix-broken install
sudo apt install ubuntu-desktop
|valign="top"|
<source lang="ini">
[Service]
User=spark
Group=spark
Type=forking
SuccessExitStatus=143
</source>
|-
|colspan="3"|
----
|-
|valign="top" colspan="3"|
<source lang="bash">
if [ -f '/etc/os-release' ];then
        HOST_OS_ID=$(grep -oP '(?<=^ID=).+'        /etc/os-release | tr -d '"')
    HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+'    /etc/os-release | tr -d '"')
    HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"')
fi
</source>
|}


==References==
==References==
{|
{|
| valign="top" |
| valign="top" |
* [https://stackoverflow.com/questions/73853179/ Spark » Systemd » Failed to stop Master/Slave Node]
* [https://archive.apache.org/dist/spark/ Spark » Download » Archive]
* [https://archive.apache.org/dist/spark/ Spark » Download » Archive]
* [https://stackoverflow.com/questions/36083783/ Spark » Change WebUI Port]
* [https://spark.apache.org/sql/ Spark » SQL & DataFrames]
* [https://spark.apache.org/sql/ Spark » SQL & DataFrames]
* [https://spark.apache.org/mllib/ Spark » Machine Learning]
* [https://spark.apache.org/mllib/ Spark » Machine Learning]
Line 55: Line 115:
* [https://spark.apache.org/examples.html Spark » Examples]
* [https://spark.apache.org/examples.html Spark » Examples]
* [https://spark.apache.org/graphx/ Spark » GraphX]
* [https://spark.apache.org/graphx/ Spark » GraphX]
* [[Hadoop]]
* [https://cdn.chorke.org/exec/cli/bash/install/ Spark » Install]
* [[Jupyter]]
* [[NLP]]


| valign="top" |
| valign="top" |
* [https://stackoverflow.com/questions/31450846/ Spark » Concatenate columns in DataFrame]
* [https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html Spark » Structured Streaming Programming]
* [https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html Spark » Structured Streaming Programming]
* [https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp Spark » Apache Spark, Hive & Spring Boot]
* [https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp Spark » Apache Spark, Hive & Spring Boot]
Line 66: Line 125:
* [https://stackoverflow.com/questions/59249135/ Spark » Install Slave as a Daemon]
* [https://stackoverflow.com/questions/59249135/ Spark » Install Slave as a Daemon]
* [https://stackoverflow.com/questions/40166056/ Spark » As a Linux Service]
* [https://stackoverflow.com/questions/40166056/ Spark » As a Linux Service]
* [https://camel.apache.org/components/3.18.x/spark-component.html Spark » Apache Camel]
* [https://www.baeldung.com/apache-spark Spark » Introduction]
* [https://www.baeldung.com/apache-spark Spark » Introduction]
* [[Machine Learning]]
* [https://github.com/vim89/spark-spring-boot/blob/master/src/main/java/com/vitthalmirji/spring/spark/ApplicationConfig.java Spark » Spring Boot]
* [https://hive.apache.org/ Apache Hive]
* [https://aerospike.com/ Aerospike]


| valign="top" |
| valign="top" |
Line 88: Line 146:
|-
|-
| valign="top" |
| valign="top" |
* [https://www.linode.com/docs/guides/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ Spark » Running Spark on Top of a Hadoop YARN]
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster Mode Overview]
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster Mode Overview]
* [https://spark.apache.org/docs/latest/spark-standalone.html Spark » Deploy » Standalone Mode]
* [https://spark.apache.org/docs/latest/spark-standalone.html Spark » Deploy » Standalone Mode]
* [https://spark.apache.org/docs/latest/submitting-applications.html Spark » Deploy » Spark Submit]
* [https://spark.apache.org/docs/latest/submitting-applications.html Spark » Deploy » Spark Submit]
* [https://spark.apache.org/docs/latest/running-on-kubernetes.html Spark » Deploy » Kubernetes]
* [https://spark.apache.org/docs/latest/running-on-kubernetes.html Spark » Deploy » Kubernetes]
* [https://medium.com/ymedialabs-innovation/apache-spark-on-a-multi-node-cluster-b75967c8cb2b Spark » Multi-Node Cluster]
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster]
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster]
* [https://spark.apache.org/docs/latest/running-on-mesos.html Spark » Deploy » Mesos]
* [https://spark.apache.org/docs/latest/running-on-mesos.html Spark » Deploy » Mesos]
Line 97: Line 157:


| valign="top" |
| valign="top" |
* [https://stackoverflow.com/questions/19943766/ Spark » Unable to load native-hadoop library]
* [https://arrow.apache.org/cookbook/java/ Spark » Apache Arrow Java Cookbook]
* [https://community.cloudera.com/t5/Support-Questions/Spark-access-remote-HDFS-in-cross-realm-trust-setup/td-p/87813 Spark » Access remote HDFS]
* [https://spark.apache.org/docs/latest/api/python/index.html Spark » API Docs » Python]
* [https://spark.apache.org/docs/latest/api/python/index.html Spark » API Docs » Python]
* [https://spark.apache.org/docs/latest/api/scala/org/apache/spark/index.html Spark » API Docs » Scala]
* [https://spark.apache.org/docs/latest/api/scala/org/apache/spark/index.html Spark » API Docs » Scala]
Line 102: Line 165:
* [https://spark.apache.org/docs/latest/api/sql/index.html Spark » API Docs » SQL]
* [https://spark.apache.org/docs/latest/api/sql/index.html Spark » API Docs » SQL]
* [https://spark.apache.org/docs/latest/api/R/index.html Spark » API Docs » R]
* [https://spark.apache.org/docs/latest/api/R/index.html Spark » API Docs » R]
| valign="top" |
* [https://dev.to/aws-builders/ssh-setup-and-tunneling-via-bastion-host-3kcc AWS » Bastion Host SSH Tunneling]
* [https://pub.towardsai.net/how-to-set-up-your-environment-for-spark-7820b84491ef Set Up Environment for Spark]
* [[Machine Learning]]
* [[Linux Containers]]
* [https://hive.apache.org/ Apache Hive]
* [https://aerospike.com/ Aerospike]
* [[Hadoop]]
* [[Jupyter]]
* [[NLP]]
|-
| colspan="3" |
----
|-
| valign="top" |
* [https://superuser.com/questions/513159/ Systemd » Safe Remove Services]
* [[Bastion SSH Tunneling]]
| valign="top" |


| valign="top" |
| valign="top" |


|}
|}

Latest revision as of 10:18, 25 January 2024

export PYSPARK_PYTHON='/usr/bin/python3';\
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64';\
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
spark-shell
pyspark
http://localhost:8080/
http://localhost:7077/
http://localhost:4040/
ssh -L 8080:localhost:8080 user@spark-master-host
ssh -L 7077:localhost:7077 user@spark-master-host

Master Node

sudo apt -qq update;\
export PYSPARK_PYTHON='/usr/bin/python3';\
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-arm64';\
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-master/3.3.0.sh.txt')
sudo systemctl daemon-reload
sudo systemctl enable spark-master.service
sudo systemctl start  spark-master.service
sudo systemctl status spark-master.service

Worker Node

sudo apt -qq update;\
export PYSPARK_PYTHON='/usr/bin/python3';\
export SPARK_MASTER='spark://ns12-pc04:7077';\
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64';\
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-slave/3.3.0.sh.txt')
sudo systemctl daemon-reload
sudo systemctl enable spark-slave.service
sudo systemctl start  spark-slave.service
sudo systemctl status spark-slave.service

Knowledge

ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "spark@${HOSTNAME}"
readlink -f /usr/bin/java | sed "s:bin/java::"
sudo apt-get install pdsh
sudo apt-get install ssh

su - spark
sudo -u spark -H sh -c 'whoami; echo ${HOME}'
sh $SPARK_HOME/bin/spark-shell
sh $SPARK_HOME/bin/pyspark
http://127.0.0.1:8080
http://127.0.0.1:8088
http://127.0.0.1:9870
http://127.0.0.1:4040

sudo apt dist-upgrade
sudo do-release-upgrade

sudo apt --fix-broken install
sudo apt install ubuntu-desktop
[Service]
User=spark
Group=spark
Type=forking
SuccessExitStatus=143

if [ -f '/etc/os-release' ];then
         HOST_OS_ID=$(grep -oP '(?<=^ID=).+'         /etc/os-release | tr -d '"')
    HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+'    /etc/os-release | tr -d '"')
    HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"')
fi

References