'''Hadoop''' is a '''''Java-based programming framework''''' that supports the processing and storage of extremely large datasets on a cluster of inexpensive machines. It was the first major open-source project in the big-data field and is sponsored by the Apache Software Foundation. Hadoop consists of four main layers:
# '''Hadoop Common''' is the collection of utilities and libraries that support the other Hadoop modules.
# '''HDFS''', which stands for '''''Hadoop Distributed File System''''', is responsible for persisting data to disk.
# '''YARN''', short for '''''Yet Another Resource Negotiator''''', is the '''''"operating system"''''' of the cluster: it schedules jobs and allocates resources on top of HDFS.
# '''MapReduce''' is the original processing model for Hadoop clusters. It distributes work across the cluster in a ''map'' step, then organizes and reduces the per-node results into a response to a query in a ''reduce'' step. Many other processing models are available for the 3.x line of Hadoop; a quick smoke test that exercises all four layers is sketched below.
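The four layers can be exercised end to end with the word-count example that ships with every Hadoop release. A minimal sketch, assuming a single-node cluster is already configured and running and that <code>HADOOP_HOME</code> is set:
<source lang="bash">
# copy a few local files into HDFS (storage layer)
hdfs dfs -mkdir -p /user/hadoop/input
hdfs dfs -put $HADOOP_HOME/etc/hadoop/*.xml /user/hadoop/input
# submit the bundled word-count job to YARN (resource layer + MapReduce)
yarn jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
  wordcount /user/hadoop/input /user/hadoop/output
# read the reduced result back out of HDFS
hdfs dfs -cat /user/hadoop/output/part-r-00000 | head
</source>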
==Configuration==
<source lang="bash">
mkdir -p /home/hadoop/hdfs/{datanode,namenode}/
# overwrite rather than append: the stock core-site.xml already contains an
# empty <configuration/> element, and a second one would make the file invalid
sudo tee $HADOOP_HOME/etc/hadoop/core-site.xml >/dev/null <<EOF
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/home/hadoop/tmp</value>
  </property>
  <property>
    <!-- fs.default.name is the deprecated alias of fs.defaultFS; both are accepted -->
    <name>fs.default.name</name>
    <value>hdfs://0.0.0.0:9000</value>
    <description>The default file system URI</description>
  </property>
</configuration>
EOF
</source>
----
<source lang="bash">
mkdir -p /home/hadoop/hdfs/{datanode,namenode}
# overwrite rather than append, for the same reason as core-site.xml above
sudo tee $HADOOP_HOME/etc/hadoop/hdfs-site.xml >/dev/null <<EOF
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/home/hadoop/hdfs/namenode/</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/home/hadoop/hdfs/datanode/</value>
  </property>
</configuration>
EOF
</source>
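The NameNode and DataNode refuse to start if they cannot write to these directories, so it is worth confirming they belong to the <code>hadoop</code> user. A small check, assuming the directories were created as above:
<source lang="bash">
sudo mkdir -p /home/hadoop/tmp
sudo chown -R hadoop:hadoop /home/hadoop/hdfs /home/hadoop/tmp
ls -ld /home/hadoop/hdfs/{namenode,datanode} /home/hadoop/tmp
</source>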
----
<source lang="bash"> | |||
sudo tee -a $HADOOP_HOME/etc/hadoop/mapred-site.xml >/dev/null <<EOF | |||
<configuration> | |||
<property> | |||
<name>mapreduce.framework.name</name> | |||
<value>yarn</value> | |||
</property> | |||
</configuration> | |||
EOF | |||
</source> | |||
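On Hadoop 3.x, jobs submitted to YARN can fail with ''could not find or load main class ... MRAppMaster'' unless <code>mapreduce.application.classpath</code> is also set in <code>mapred-site.xml</code>. One common approach (an assumption about this particular setup, not something the snippet above already does) is to paste the output of the following command into that property:
<source lang="bash">
# prints the full classpath of the local Hadoop installation; its output is a
# reasonable value for mapreduce.application.classpath in mapred-site.xml
hadoop classpath
</source>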
----
<source lang="bash">
sudo tee $HADOOP_HOME/etc/hadoop/yarn-site.xml >/dev/null <<EOF
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>
EOF
</source>
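A quick way to confirm that the files above are actually being read (a minimal check, assuming the Hadoop binaries are on the <code>PATH</code> of the <code>hadoop</code> user):
<source lang="bash">
# print the effective value of individual keys after editing the *-site.xml files
hdfs getconf -confKey fs.default.name       # expect hdfs://0.0.0.0:9000
hdfs getconf -confKey dfs.replication       # expect 1
hdfs getconf -confKey dfs.namenode.name.dir
</source>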
==Unit Testing==
{|
|valign="top"|
<source lang='bash'>
lxc launch images:debian/12 agronomy &&
lxc exec agronomy bash <<'EOF'
sleep 5
apt-get update
# Debian 12 ships OpenJDK 17; use openjdk-17-jre if openjdk-11-jre is unavailable
apt-get install -y curl wget openjdk-11-jre \
  openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
</source>
|valign="top"|
<source lang='bash'>
lxc launch images:fedora/37 robotics &&
lxc exec robotics bash <<'EOF'
sleep 5
dnf install -y curl wget java-11-openjdk \
  openssh-clients openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
</source>
|valign="top"|
<source lang='bash'>
lxc launch images:ubuntu/22.04 software &&
lxc exec software bash <<'EOF'
sleep 5
apt-get update
apt-get install -y curl wget openjdk-11-jre \
  openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
</source>
|-
|colspan="3"|
----
|-
|valign="top"|
'''create alias:'''
 lxc stop agronomy
 lxc publish agronomy --alias \
 debian/12:java:ssh
'''create alias from snapshot:'''
 lxc snapshot '''agronomy java:ssh'''
 lxc publish '''agronomy/java:ssh''' --alias \
 debian/12:java:ssh
 lxc delete agronomy
'''launch alias:'''
 lxc launch '''debian/12:java:ssh''' agronomy &&
 lxc exec agronomy bash
 lxc stop agronomy && lxc delete agronomy
|valign="top"|
'''create alias:'''
 lxc stop robotics
 lxc publish robotics --alias \
 fedora/37:java:ssh
'''create alias from snapshot:'''
 lxc snapshot '''robotics java:ssh'''
 lxc publish '''robotics/java:ssh''' --alias \
 fedora/37:java:ssh
 lxc delete robotics
'''launch alias:'''
 lxc launch '''fedora/37:java:ssh''' robotics &&
 lxc exec robotics bash
 lxc stop robotics && lxc delete robotics
|valign="top"|
'''create alias:'''
 lxc stop software
 lxc publish software --alias \
 ubuntu/22.04:java:ssh
'''create alias from snapshot:'''
 lxc snapshot '''software java:ssh'''
 lxc publish '''software/java:ssh''' --alias \
 ubuntu/22.04:java:ssh
 lxc delete software
'''launch alias:'''
 lxc launch '''ubuntu/22.04:java:ssh''' software &&
 lxc exec software bash
 lxc stop software && lxc delete software
|}
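The three published images behave the same from here on, so they can be exercised in one loop. A convenience sketch, assuming the aliases created above exist in the local LXD image store:
<source lang='bash'>
declare -A IMAGES=([agronomy]='debian/12:java:ssh'
                   [robotics]='fedora/37:java:ssh'
                   [software]='ubuntu/22.04:java:ssh')
for NAME in "${!IMAGES[@]}"; do
  lxc launch "${IMAGES[$NAME]}" "$NAME"
  sleep 5
  lxc exec "$NAME" -- java -version             # JRE baked into the image
  lxc exec "$NAME" -- systemctl is-active sshd  # sshd baked into the image
  lxc stop "$NAME" && lxc delete "$NAME"
done
</source>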
== Knowledge ==
{|
|valign="top" colspan="3"|
 ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "hadoop@${HOSTNAME}"
 readlink -f /usr/bin/java | sed "s:bin/java::"
 sudo apt-get install pdsh
 sudo apt-get install ssh
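Hadoop's start scripts ssh into localhost, so the generated key still has to be authorized, and <code>JAVA_HOME</code> (derived with the <code>readlink</code> one-liner above) has to be exported for the <code>hadoop</code> user. A short sketch, assuming the key was generated as above:
<source lang="bash">
# authorize the new key for passwordless ssh to localhost
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
ssh -o StrictHostKeyChecking=accept-new localhost 'hostname'   # should not prompt for a password
# point Hadoop at the JRE found above (path varies by distribution)
echo "export JAVA_HOME=$(readlink -f /usr/bin/java | sed 's:bin/java::')" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
</source>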
|-
|colspan="3"|
----
|-
|valign="top" colspan="3"|
 su - hadoop
 hdfs namenode -format
 sudo -u hadoop -H sh -c 'whoami; echo ${HOME}'
 $HADOOP_HOME/sbin/start-dfs.sh
 http://127.0.0.1:9870   # NameNode web UI
 $HADOOP_HOME/sbin/start-yarn.sh
 http://127.0.0.1:8088   # YARN ResourceManager web UI
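Once both start scripts have run, the Java daemons can be verified from the same shell (a quick check, assuming a single-node setup):
<source lang="bash">
jps                     # expect NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager
hdfs dfsadmin -report   # capacity and live DataNodes
yarn node -list         # registered NodeManagers
</source>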
|-
|colspan="3"|
----
|-
|valign="bottom" colspan="2"|
 sudo apt dist-upgrade
 sudo do-release-upgrade
 sudo apt --fix-broken install
 sudo apt install ubuntu-desktop
|valign="top"|
<source lang="ini">
[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
</source>
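These directives only make sense inside a full unit file. A hypothetical sketch of what such a unit could look like (the unit name, description and <code>ExecStart</code>/<code>ExecStop</code> paths are assumptions for illustration, not taken from this page):
<source lang="bash">
# hypothetical example: run the HDFS daemons as a systemd service
sudo tee /etc/systemd/system/hadoop-dfs.service >/dev/null <<'EOF'
[Unit]
Description=Hadoop HDFS daemons (hypothetical example)
After=network.target

[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
ExecStart=/opt/hadoop/sbin/start-dfs.sh
ExecStop=/opt/hadoop/sbin/stop-dfs.sh

[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable --now hadoop-dfs.service
</source>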
|-
|colspan="3"|
----
|-
|valign="top" colspan="3"|
<source lang="bash">
if [ -f '/etc/os-release' ]; then
  HOST_OS_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
  HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+' /etc/os-release | tr -d '"')
  HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"')
fi
</source>
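A typical use of those variables is to pick the right package manager, mirroring the install commands from the ''Unit Testing'' section above (a sketch; the package lists are copied from those blocks):
<source lang="bash">
case "${HOST_OS_ID:-${HOST_OS_ID_LIKE}}" in
  debian|ubuntu)
    apt-get update && apt-get install -y openjdk-11-jre openssh-client openssh-server ;;
  fedora|rhel|centos)
    dnf install -y java-11-openjdk openssh-clients openssh-server ;;
  *)
    echo "unsupported distribution: ${HOST_OS_ID}" >&2 ;;
esac
</source>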
|-
|colspan="3"|
----
|-
|valign="top" colspan="3"|
<source lang='bash'>
declare -a HADOOP_SCHEDULERS=(dev prod)
declare -A MINMAX_CAPACITIES=([dev]='50 50' [prod]='50 70')
for HADOOP_SCHEDULER in "${HADOOP_SCHEDULERS[@]}"; do
  # each queue carries two whitespace-separated values, iterated one by one
  for MINMAX_CAPACITY in ${MINMAX_CAPACITIES[${HADOOP_SCHEDULER}]}; do
    echo "${HADOOP_SCHEDULER} => ${MINMAX_CAPACITY}"
  done
done
</source>
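The two values per queue read like a minimum and a maximum capacity; mapped onto the YARN Capacity Scheduler they would correspond to the <code>capacity</code> and <code>maximum-capacity</code> properties of each queue. A hedged sketch of how the arrays above could be turned into <code>capacity-scheduler.xml</code> fragments (the <code>root.dev</code>/<code>root.prod</code> queue layout is an assumption):
<source lang='bash'>
declare -a HADOOP_SCHEDULERS=(dev prod)
declare -A MINMAX_CAPACITIES=([dev]='50 50' [prod]='50 70')
for HADOOP_SCHEDULER in "${HADOOP_SCHEDULERS[@]}"; do
  # split "min max" into the two Capacity Scheduler properties for this queue
  read -r MIN MAX <<< "${MINMAX_CAPACITIES[${HADOOP_SCHEDULER}]}"
  cat <<EOF
<property>
  <name>yarn.scheduler.capacity.root.${HADOOP_SCHEDULER}.capacity</name>
  <value>${MIN}</value>
</property>
<property>
  <name>yarn.scheduler.capacity.root.${HADOOP_SCHEDULER}.maximum-capacity</name>
  <value>${MAX}</value>
</property>
EOF
done
</source>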
|}
==References==
{|
| valign="top" |
* [https://www.digitalocean.com/community/tutorials/how-to-install-hadoop-in-stand-alone-mode-on-ubuntu-20-04 Hadoop » Install Standalone Mode on Ubuntu 20.04]
* [https://www.vultr.com/docs/install-and-configure-apache-hadoop-on-ubuntu-20-04/ Hadoop » Install & Configure on Ubuntu 20.04]
* [https://www.digitalocean.com/community/tutorials/an-introduction-to-big-data-concepts-and-terminology Hadoop » Big Data Concepts & Terminology]
* [https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html Hadoop » Setting up a Single Node Cluster]
* [https://stackoverflow.com/questions/40398280/ Hadoop » Install as a Daemon]
* [https://archive.apache.org/dist/hadoop/common/ Hadoop » Download » Archive]
* [https://archive.apache.org/dist/hadoop/common/current/ Hadoop » Download » Current]
* [https://archive.apache.org/dist/hadoop/common/stable/ Hadoop » Download » Stable]
* [https://www.digitalocean.com/community/tutorials/an-introduction-to-hadoop Hadoop » An Introduction]
* [https://hadoop.apache.org/docs/stable/ Hadoop » Docs » Stable]
| valign="top" |
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml Hadoop » <code>hadoop-mapreduce-client-core/mapred-default.xml</code>]
* [https://hadoop.apache.org/docs//r3.3.4/hadoop-yarn/hadoop-yarn-common/yarn-default.xml Hadoop » <code>hadoop-yarn-common/yarn-default.xml</code>]
* [https://www.linode.com/docs/guides/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ Hadoop » Running Spark on Top of a Hadoop YARN Cluster]
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml Hadoop » <code>hadoop-hdfs/hdfs-default.xml</code>]
* [https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions Hadoop » Java Versions]
* [[VS Code on iPad Pro]]
* [[Machine Learning]]
* [[Jupyter]]
* [[Spark]]
* [[NLP]]
| valign="top" |
* [https://www.ibm.com/docs/en/spectrum-scale-bda?topic=requirements-firewall-recommendations-hdfs-transparency Hadoop » Firewall Recommendations for HDFS Transparency]
* [https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.0.0/administration/content/hdfs-ports.html Hadoop » Administering HDFS]
* [https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.3.0-Win/bk_HDP_Install_Win/content/ref-79239257-778e-42a9-9059-d982d0c08885.1.html Hadoop » Configure Ports]
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-common/ClusterSetup.html Hadoop » Cluster Setup]
* [https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions Hadoop » Java Versions]
* [https://en.wikipedia.org/wiki/Hortonworks Hadoop » Hortonworks]
* [https://en.wikipedia.org/wiki/Cloudera Hadoop » Cloudera]
|-
| colspan="3" |
----
|-
| valign="top" |
* [https://unix.stackexchange.com/questions/11544/ Bash » Difference between <code>/opt</code> and <code>/usr/local</code>]
* [https://stackoverflow.com/questions/1988249/ Bash » Switch user & execute remaining script]
* [https://unix.stackexchange.com/questions/291454/ Difference between sudo user vs. root user]
* [https://jumpcloud.com/blog/how-to-create-a-new-sudo-user-manage-sudo-access-on-ubuntu-20-04 Create a Sudo User & Manage Access]
* [https://unix.stackexchange.com/questions/69314/ Bash » Non-interactive <code>ssh-keygen</code>]
* [https://superuser.com/questions/468161/ Bash » Switch user & execute script]
* [https://unix.stackexchange.com/questions/28526/ Bash » Add a user if it doesn't exist]
* [[Sed Replace A Multi-Line String]]
* [https://serverfault.com/questions/283129/ SSH Connection Hang Forever]
* [https://pub.towardsai.net/how-to-set-up-your-environment-for-spark-7820b84491ef Set Up Environment for Spark]
| valign="top" |
* [https://dev.to/aws-builders/ssh-setup-and-tunneling-via-bastion-host-3kcc AWS » Bastion Host SSH Tunneling]
* [https://superuser.com/questions/513159/ Systemd » Safe Remove Services]
* [https://www.datacamp.com/tutorial/installation-of-pyspark Installation of PySpark]
* [[Bastion SSH Tunneling]]
* [[Linux Containers]]
| valign="top" |
|}