Spark: Difference between revisions
Jump to navigation
Jump to search
(47 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
<source lang="bash"> | <source lang="bash"> | ||
export PYSPARK_PYTHON='/usr/bin/python3';\ | |||
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\ | |||
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64';\ | |||
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin | |||
</source> | |||
spark-shell | |||
pyspark | |||
http://localhost:8080/ | |||
http://localhost:7077/ | |||
http://localhost:4040/ | |||
ssh -L 8080:localhost:8080 user@spark.chorke.org | |||
ssh -L 7077:localhost:7077 user@spark.chorke.org | |||
==Master Node== | |||
<source lang="bash"> | |||
sudo apt -qq update;\ | |||
export PYSPARK_PYTHON='/usr/bin/python3';\ | |||
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\ | |||
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-arm64';\ | |||
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-master/3.3.0.sh.txt') | |||
</source> | </source> | ||
== | sudo systemctl daemon-reload | ||
sudo systemctl enable spark-master.service | |||
sudo systemctl start spark-master.service | |||
sudo systemctl status spark-master.service | |||
==Worker Node== | |||
<source lang="bash"> | <source lang="bash"> | ||
sudo | sudo apt -qq update;\ | ||
export PYSPARK_PYTHON='/usr/bin/python3';\ | |||
export SPARK_MASTER='spark://ns12-pc04:7077';\ | |||
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3';\ | |||
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64';\ | |||
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-slave/3.3.0.sh.txt') | |||
</source> | |||
sudo systemctl daemon-reload | |||
sudo systemctl enable spark-slave.service | |||
sudo systemctl start spark-slave.service | |||
sudo systemctl status spark-slave.service | |||
== Knowledge == | |||
{| | |||
|valign="top" colspan="3"| | |||
ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "spark@${HOSTNAME}" | |||
readlink -f /usr/bin/java | sed "s:bin/java::" | |||
sudo apt-get install pdsh | |||
sudo apt-get install ssh | |||
|- | |||
|colspan="3"| | |||
---- | |||
|- | |||
|valign="top" colspan="3"| | |||
su -h spark | |||
sudo -u spark -H sh -c "whoami; echo ${HOME}" | |||
sh $SPARK_HOME/bin/spark-shell | |||
sh $SPARK_HOME/bin/pyspark | |||
http://127.0.0.1:8080 | |||
http://127.0.0.1:8088 | |||
http://127.0.0.1:9870 | |||
http://127.0.0.1:4040 | |||
|- | |||
|colspan="3"| | |||
---- | |||
|- | |||
|valign="bottom" colspan="2"| | |||
sudo apt dist-upgrade | |||
sudo do-release-upgrade | |||
sudo apt --fix-broken install | |||
sudo apt install ubuntu-desktop | |||
|valign="top"| | |||
<source lang="ini"> | |||
[Service] | [Service] | ||
User=spark | User=spark | ||
Group=spark | |||
Type=forking | Type=forking | ||
SuccessExitStatus=143 | |||
</source> | |||
[ | |- | ||
|colspan="3"| | |||
---- | |||
|- | |||
|valign="top" colspan="3"| | |||
<source lang="bash"> | |||
if [ -f '/etc/os-release' ];then | |||
HOST_OS_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | |||
HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+' /etc/os-release | tr -d '"') | |||
HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') | |||
fi | |||
</source> | </source> | ||
|} | |||
==References== | ==References== | ||
{| | {| | ||
| valign="top" | | | valign="top" | | ||
* [https://stackoverflow.com/questions/73853179/ Spark » Systemd » Failed to stop Master/Slave Node] | |||
* [https://archive.apache.org/dist/spark/ Spark » Download » Archive] | * [https://archive.apache.org/dist/spark/ Spark » Download » Archive] | ||
* [https://stackoverflow.com/questions/36083783/ Spark » Change WebUI Port] | |||
* [https://spark.apache.org/sql/ Spark » SQL & DataFrames] | * [https://spark.apache.org/sql/ Spark » SQL & DataFrames] | ||
* [https://spark.apache.org/mllib/ Spark » Machine Learning] | * [https://spark.apache.org/mllib/ Spark » Machine Learning] | ||
Line 55: | Line 115: | ||
* [https://spark.apache.org/examples.html Spark » Examples] | * [https://spark.apache.org/examples.html Spark » Examples] | ||
* [https://spark.apache.org/graphx/ Spark » GraphX] | * [https://spark.apache.org/graphx/ Spark » GraphX] | ||
* [ | * [https://cdn.chorke.org/exec/cli/bash/install/ Spark » Install] | ||
| valign="top" | | | valign="top" | | ||
* [https://stackoverflow.com/questions/31450846/ Spark » Concatenate columns in DataFrame] | |||
* [https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html Spark » Structured Streaming Programming] | * [https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html Spark » Structured Streaming Programming] | ||
* [https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp Spark » Apache Spark, Hive & Spring Boot] | * [https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp Spark » Apache Spark, Hive & Spring Boot] | ||
* [https://github.com/SimonHarmonicMinor/apache-spark-integration-testing-example Spark » Integration Testing Example] | * [https://github.com/SimonHarmonicMinor/apache-spark-integration-testing-example Spark » Integration Testing Example] | ||
* [https://github.com/juju-solutions/layer-apache-spark/blob/master/templates/master-systemd.conf Spark » Install Master as a Daemon] | |||
* [https://stackoverflow.com/questions/59249135/ Spark » Install Slave as a Daemon] | |||
* [https://stackoverflow.com/questions/40166056/ Spark » As a Linux Service] | |||
* [https://camel.apache.org/components/3.18.x/spark-component.html Spark » Apache Camel] | |||
* [https://www.baeldung.com/apache-spark Spark » Introduction] | |||
* [https://github.com/vim89/spark-spring-boot/blob/master/src/main/java/com/vitthalmirji/spring/spark/ApplicationConfig.java Spark » Spring Boot] | |||
| valign="top" | | |||
* [https://stackoverflow.com/questions/61472322/ KillSignal interact with TimeoutStopSec] | |||
* [https://unix.stackexchange.com/questions/388483/ Delay Systemd Service If File Exist] | |||
* [https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Conditions%20and%20Asserts System Unit Conditions & Asserts] | |||
* [[VS Code on iPad Pro]] | * [[VS Code on iPad Pro]] | ||
* [[ | * [[Apache Camel]] | ||
* [ | * [[Mosquitto]] | ||
* [ | * [[ActiveMQ]] | ||
* [[Keycloak]] | * [[Keycloak]] | ||
* [[GraphQL]] | * [[GraphQL]] | ||
* [[Grafana]] | * [[Grafana]] | ||
|- | |- | ||
Line 81: | Line 146: | ||
|- | |- | ||
| valign="top" | | | valign="top" | | ||
* [https://www.linode.com/docs/guides/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ Spark » Running Spark on Top of a Hadoop YARN] | |||
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster Mode Overview] | * [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster Mode Overview] | ||
* [https://spark.apache.org/docs/latest/spark-standalone.html Spark » Deploy » Standalone Mode] | * [https://spark.apache.org/docs/latest/spark-standalone.html Spark » Deploy » Standalone Mode] | ||
* [https://spark.apache.org/docs/latest/submitting-applications.html Spark » Deploy » Spark Submit] | * [https://spark.apache.org/docs/latest/submitting-applications.html Spark » Deploy » Spark Submit] | ||
* [https://spark.apache.org/docs/latest/running-on-kubernetes.html Spark » Deploy » Kubernetes] | * [https://spark.apache.org/docs/latest/running-on-kubernetes.html Spark » Deploy » Kubernetes] | ||
* [https://medium.com/ymedialabs-innovation/apache-spark-on-a-multi-node-cluster-b75967c8cb2b Spark » Multi-Node Cluster] | |||
* [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster] | * [https://spark.apache.org/docs/latest/cluster-overview.html Spark » Deploy » Cluster] | ||
* [https://spark.apache.org/docs/latest/running-on-mesos.html Spark » Deploy » Mesos] | * [https://spark.apache.org/docs/latest/running-on-mesos.html Spark » Deploy » Mesos] | ||
Line 90: | Line 157: | ||
| valign="top" | | | valign="top" | | ||
* [https://stackoverflow.com/questions/19943766/ Spark » Unable to load native-hadoop library] | |||
* [https://arrow.apache.org/cookbook/java/ Spark » Apache Arrow Java Cookbook] | |||
* [https://community.cloudera.com/t5/Support-Questions/Spark-access-remote-HDFS-in-cross-realm-trust-setup/td-p/87813 Spark » Access remote HDFS] | |||
* [https://spark.apache.org/docs/latest/api/python/index.html Spark » API Docs » Python] | * [https://spark.apache.org/docs/latest/api/python/index.html Spark » API Docs » Python] | ||
* [https://spark.apache.org/docs/latest/api/scala/org/apache/spark/index.html Spark » API Docs » Scala] | * [https://spark.apache.org/docs/latest/api/scala/org/apache/spark/index.html Spark » API Docs » Scala] | ||
Line 95: | Line 165: | ||
* [https://spark.apache.org/docs/latest/api/sql/index.html Spark » API Docs » SQL] | * [https://spark.apache.org/docs/latest/api/sql/index.html Spark » API Docs » SQL] | ||
* [https://spark.apache.org/docs/latest/api/R/index.html Spark » API Docs » R] | * [https://spark.apache.org/docs/latest/api/R/index.html Spark » API Docs » R] | ||
| valign="top" | | |||
* [https://dev.to/aws-builders/ssh-setup-and-tunneling-via-bastion-host-3kcc AWS » Bastion Host SSH Tunneling] | |||
* [https://pub.towardsai.net/how-to-set-up-your-environment-for-spark-7820b84491ef Set Up Environment for Spark] | |||
* [[Machine Learning]] | |||
* [[Linux Containers]] | |||
* [https://hive.apache.org/ Apache Hive] | |||
* [https://aerospike.com/ Aerospike] | |||
* [[Hadoop]] | |||
* [[Jupyter]] | |||
* [[NLP]] | |||
|- | |||
| colspan="3" | | |||
---- | |||
|- | |||
| valign="top" | | |||
* [https://superuser.com/questions/513159/ Systemd » Safe Remove Services] | |||
* [[Bastion SSH Tunneling]] | |||
| valign="top" | | |||
| valign="top" | | | valign="top" | | ||
|} | |} |
Latest revision as of 10:18, 25 January 2024
# Spark client environment: Python interpreter for PySpark, install roots
# for Spark and the JDK, and the Spark CLI tools appended to PATH.
export PYSPARK_PYTHON='/usr/bin/python3'
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3'
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64'
export PATH="$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin"
spark-shell
pyspark
http://localhost:8080/
http://localhost:7077/
http://localhost:4040/
ssh -L 8080:localhost:8080 user@spark.chorke.org
ssh -L 7077:localhost:7077 user@spark.chorke.org
Master Node
# Master-node bootstrap: refresh package metadata, publish the runtime
# layout (note: arm64 JDK on this host), then hand off to the hosted
# installer script for the Spark 3.3.0 master service.
sudo apt -qq update
export PYSPARK_PYTHON='/usr/bin/python3'
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3'
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-arm64'
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-master/3.3.0.sh.txt')
sudo systemctl daemon-reload
sudo systemctl enable spark-master.service
sudo systemctl start spark-master.service
sudo systemctl status spark-master.service
Worker Node
# Worker-node bootstrap: refresh package metadata, point the worker at the
# master's spark:// endpoint, publish the runtime layout (amd64 JDK), then
# hand off to the hosted installer script for the Spark 3.3.0 worker.
sudo apt -qq update
export PYSPARK_PYTHON='/usr/bin/python3'
export SPARK_MASTER='spark://ns12-pc04:7077'
export SPARK_HOME='/opt/cli/spark-3.3.0-bin-hadoop3'
export JAVA_HOME='/usr/lib/jvm/java-17-openjdk-amd64'
bash <(curl -s 'https://cdn.chorke.org/exec/cli/bash/install/apache-spark-slave/3.3.0.sh.txt')
sudo systemctl daemon-reload
sudo systemctl enable spark-slave.service
sudo systemctl start spark-slave.service
sudo systemctl status spark-slave.service
Knowledge
ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "spark@${HOSTNAME}"
readlink -f /usr/bin/java | sed "s:bin/java::"
sudo apt-get install pdsh
sudo apt-get install ssh
| ||
su -h spark
sudo -u spark -H sh -c "whoami; echo ${HOME}"
sh $SPARK_HOME/bin/spark-shell
sh $SPARK_HOME/bin/pyspark
http://127.0.0.1:8080
http://127.0.0.1:8088
http://127.0.0.1:9870
http://127.0.0.1:4040
| ||
sudo apt dist-upgrade
sudo do-release-upgrade
sudo apt --fix-broken install
sudo apt install ubuntu-desktop
# systemd [Service] fragment for the Spark unit: run the daemon as the
# dedicated 'spark' user/group instead of root.
[Service]
User=spark
Group=spark
# Spark's start-*.sh launcher forks the JVM and exits, so systemd must
# track the forked child rather than the launcher process.
Type=forking
# JVMs terminated by SIGTERM exit with 128+15=143; count that as a clean
# stop so 'systemctl stop' does not report a failure.
SuccessExitStatus=143
| |
| ||
# Detect the host OS identity from the standard os-release file; on
# systems without it the HOST_OS_* variables are simply left unset.
if [ -f '/etc/os-release' ]; then
  # Pull one KEY=value field out of os-release, stripping any quotes.
  os_release_field() { grep -oP "(?<=^${1}=).+" /etc/os-release | tr -d '"'; }
  HOST_OS_ID=$(os_release_field ID)
  HOST_OS_ID_LIKE=$(os_release_field ID_LIKE)
  HOST_OS_VERSION=$(os_release_field VERSION_ID)
fi
|