Hadoop


Hadoop is a Java-based programming framework that supports the processing and storage of extremely large datasets on a cluster of inexpensive machines. It was the first major open-source project in the big data field and is sponsored by the Apache Software Foundation. Hadoop comprises four main layers:

  1. Hadoop Common is the collection of utilities and libraries that support other Hadoop modules.
  2. HDFS, which stands for Hadoop Distributed File System, is responsible for persisting data to disk.
  3. YARN, short for Yet Another Resource Negotiator, schedules jobs and allocates cluster resources; it acts as the "operating system" for applications running over HDFS.
  4. MapReduce is the original processing model for Hadoop clusters. It distributes work across the cluster in a map step, then organizes and reduces the per-node results into a response to a query. Many other processing models are available for the 3.x line of Hadoop; the wordcount smoke test below shows the model in action.
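
A quick way to watch MapReduce work is the wordcount job from the examples jar that ships with every Hadoop release; the sketch below assumes HDFS and YARN are already running and that the jar path matches your install:

hdfs dfs -mkdir -p /user/hadoop/input
hdfs dfs -put $HADOOP_HOME/etc/hadoop/*.xml /user/hadoop/input
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
    wordcount /user/hadoop/input /user/hadoop/output
hdfs dfs -cat /user/hadoop/output/part-r-00000 | head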

Configuration

mkdir -p /home/hadoop/tmp /home/hadoop/hdfs/{datanode,namenode}
# overwrite rather than append: the stock *-site.xml files already contain an empty <configuration/> stub
sudo tee $HADOOP_HOME/etc/hadoop/core-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop/tmp</value>
    </property>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
        <description>The default file system URI</description>
    </property>
</configuration>
EOF
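
To confirm the file was picked up, hdfs getconf reads a key straight from the configuration directory; no daemons need to be running:

hdfs getconf -confKey fs.defaultFS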

mkdir -p /home/hadoop/hdfs/{datanode,namenode}
sudo tee $HADOOP_HOME/etc/hadoop/hdfs-site.xml >/dev/null <<EOF
<configuration>
    <property>
      <name>dfs.replication</name>
      <value>1</value>
    </property>
    <property>
      <name>dfs.namenode.name.dir</name>
      <value>/home/hadoop/hdfs/namenode/</value>
    </property>
    <property>
      <name>dfs.datanode.data.dir</name>
      <value>/home/hadoop/hdfs/datanode/</value>
    </property>
</configuration>
EOF
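
The HDFS directories must be writable by the account that runs the NameNode and DataNode; assuming the dedicated hadoop user used throughout this page, a chown after the mkdir heads off the most common startup failure:

sudo chown -R hadoop:hadoop /home/hadoop/hdfs /home/hadoop/tmp
hdfs getconf -confKey dfs.replication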

sudo tee $HADOOP_HOME/etc/hadoop/mapred-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
EOF

sudo tee $HADOOP_HOME/etc/hadoop/yarn-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>
EOF
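
A stray character in any of the four files stops the daemons with an opaque parser error, so a quick well-formedness pass with xmllint (packaged as libxml2-utils on Debian/Ubuntu) is a cheap safeguard:

xmllint --noout $HADOOP_HOME/etc/hadoop/core-site.xml \
    $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
    $HADOOP_HOME/etc/hadoop/mapred-site.xml \
    $HADOOP_HOME/etc/hadoop/yarn-site.xml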

Unit Testing

lxc launch images:debian/12 agronomy &&
lxc exec agronomy bash <<'EOF'
sleep 5
apt-get update
# Debian 12 ships OpenJDK 17; use openjdk-17-jre if openjdk-11-jre is unavailable
apt-get install -y curl wget openjdk-11-jre \
 openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
lxc launch images:fedora/37 robotics &&
lxc exec robotics bash <<'EOF'
sleep 5
# Fedora names the client package openssh-clients (plural)
dnf install -y curl wget java-11-openjdk \
 openssh-clients openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
lxc launch images:ubuntu/22.04 software &&
lxc exec software bash <<'EOF'
sleep 5
apt-get update
apt-get install -y curl wget openjdk-11-jre \
 openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
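
Once all three containers are up, the same check can be driven from the host in one loop; the container names mirror the launches above:

for BOX in agronomy robotics software; do
    lxc exec "$BOX" -- java -version
done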

create alias:
lxc stop     agronomy
lxc publish  agronomy --alias\
 debian/12:java:ssh

create alias from snapshot:
lxc snapshot agronomy java:ssh
lxc publish  agronomy/java:ssh --alias\
 debian/12:java:ssh
lxc delete   agronomy

launch alias:
lxc launch   debian/12:java:ssh agronomy &&
lxc exec     agronomy bash
lxc stop     agronomy && lxc delete agronomy

create alias:
lxc stop     robotics
lxc publish  robotics --alias\
 fedora/37:java:ssh

create alias from snapshot:
lxc snapshot robotics java:ssh
lxc publish  robotics/java:ssh --alias\
 fedora/37:java:ssh
lxc delete   robotics

launch alias:
lxc launch   fedora/37:java:ssh robotics &&
lxc exec     robotics bash
lxc stop     robotics && lxc delete robotics

create alias:
lxc stop     software
lxc publish  software --alias\
 ubuntu/22.04:java:ssh

create alias from snapshot:
lxc snapshot software java:ssh
lxc publish  software/java:ssh --alias\
 ubuntu/22.04:java:ssh
lxc delete   software

launch alias:
lxc launch   ubuntu/22.04:java:ssh software &&
lxc exec     software bash
lxc stop     software && lxc delete software

Knowledge

ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "hadoop@${HOSTNAME}"
readlink -f /usr/bin/java | sed "s:bin/java::"
sudo apt-get install pdsh
sudo apt-get install ssh
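
pdsh defaults to rsh as its remote command layer, which makes Hadoop's start scripts appear to hang; exporting the rcmd type (commonly from ~/.bashrc or hadoop-env.sh) steers it to ssh:

export PDSH_RCMD_TYPE=ssh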

su - hadoop
hdfs namenode -format
sudo -u hadoop -H sh -c 'whoami; echo $HOME'
sh $HADOOP_HOME/sbin/start-dfs.sh
http://127.0.0.1:9870
sh $HADOOP_HOME/sbin/start-yarn.sh
http://127.0.0.1:8088
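
jps from the JDK lists the running Java daemons; after both start scripts a healthy single node should show NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager:

jps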

sudo apt dist-upgrade
sudo do-release-upgrade

sudo apt --fix-broken install
sudo apt install ubuntu-desktop
[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
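
The [Service] fragment above slots into a full unit file; the sketch below for a NameNode service is hypothetical (unit name, install prefix and JAVA_HOME are assumptions to adapt):

sudo tee /etc/systemd/system/hadoop-namenode.service >/dev/null <<'EOF'
[Unit]
Description=Hadoop HDFS NameNode
After=network.target

[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
Environment=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ExecStart=/opt/hadoop/bin/hdfs --daemon start namenode
ExecStop=/opt/hadoop/bin/hdfs --daemon stop namenode

[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable --now hadoop-namenode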

if [ -f '/etc/os-release' ];then
         HOST_OS_ID=$(grep -oP '(?<=^ID=).+'         /etc/os-release | tr -d '"')
    HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+'    /etc/os-release | tr -d '"')
    HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"')
fi
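
Those variables make the rest of a provisioning script distribution-agnostic; a minimal sketch of how they might drive package selection (package names assumed, matching the containers above):

case "${HOST_OS_ID}:${HOST_OS_ID_LIKE}" in
    fedora:*|*:fedora) dnf install -y openssh-clients openssh-server ;;
    debian:*|*:debian) apt-get update && apt-get install -y openssh-client openssh-server ;;
    *) echo "unhandled distribution: ${HOST_OS_ID}" >&2 ;;
esac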

declare -a HADOOP_SCHEDULERS=(dev prod);\
declare -A MINMAX_CAPACITIES=([dev]='50 50' [prod]='50 70');\
for HADOOP_SCHEDULER in ${HADOOP_SCHEDULERS[@]};do \
    for MINMAX_CAPACITY in ${MINMAX_CAPACITIES[${HADOOP_SCHEDULER}]};do \
        echo "${HADOOP_SCHEDULER} => ${MINMAX_CAPACITY}";\
    done;\
done
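
The pairs read naturally as capacity and maximum-capacity for YARN's CapacityScheduler; a hedged extension turns them into the property keys capacity-scheduler.xml expects (the root.dev/root.prod queue layout is an assumption):

declare -A MINMAX_CAPACITIES=([dev]='50 50' [prod]='50 70')
for QUEUE in "${!MINMAX_CAPACITIES[@]}"; do
    read -r MIN MAX <<< "${MINMAX_CAPACITIES[$QUEUE]}"
    echo "yarn.scheduler.capacity.root.${QUEUE}.capacity=${MIN}"
    echo "yarn.scheduler.capacity.root.${QUEUE}.maximum-capacity=${MAX}"
done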

References

* [https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-common/ClusterSetup.html Hadoop » Cluster Setup]
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml Hadoop » hadoop-hdfs/hdfs-default.xml]
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml Hadoop » hadoop-mapreduce-client-core/mapred-default.xml]
* [https://hadoop.apache.org/docs/r3.3.4/hadoop-yarn/hadoop-yarn-common/yarn-default.xml Hadoop » hadoop-yarn-common/yarn-default.xml]
* [https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions Hadoop » Java Versions]
* [https://www.linode.com/docs/guides/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ Hadoop » Running Spark on Top of a Hadoop YARN Cluster]
* [https://www.ibm.com/docs/en/spectrum-scale-bda?topic=requirements-firewall-recommendations-hdfs-transparency Hadoop » Firewall Transparency]
* [https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.0.0/administration/content/hdfs-ports.html Hadoop » Administering HDFS]
* [https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.3.0-Win/bk_HDP_Install_Win/content/ref-79239257-778e-42a9-9059-d982d0c08885.1.html Hadoop » Configure Ports]
* [https://en.wikipedia.org/wiki/Hortonworks Hadoop » Hortonworks]
* [https://en.wikipedia.org/wiki/Cloudera Hadoop » Cloudera]
* [https://superuser.com/questions/468161/ Bash » Switch user & execute script]
* [https://unix.stackexchange.com/questions/28526/ Bash » Add a user if it doesn't exist]
* [https://serverfault.com/questions/283129/ SSH Connection Hang Forever]
* [https://pub.towardsai.net/how-to-set-up-your-environment-for-spark-7820b84491ef Set Up Environment for Spark]
* [https://dev.to/aws-builders/ssh-setup-and-tunneling-via-bastion-host-3kcc AWS » Bastion Host SSH Tunneling]
* [https://superuser.com/questions/513159/ Systemd » Safe Remove Services]
* [https://www.datacamp.com/tutorial/installation-of-pyspark Installation of PySpark]
* [[Sed Replace A Multi-Line String]]
* [[Bastion SSH Tunneling]]
* [[VS Code on iPad Pro]]
* [[Linux Containers]]