Hadoop

Hadoop is a Java-based programming framework that supports the processing and storage of extremely large datasets on a cluster of inexpensive machines. It was the first major open source project in the big data field and is sponsored by the Apache Software Foundation. Hadoop comprises four main layers:

  1. Hadoop Common is the collection of utilities and libraries that support other Hadoop modules.
  2. HDFS, which stands for Hadoop Distributed File System, is responsible for persisting data to disk.
  3. YARN, short for Yet Another Resource Negotiator, schedules jobs and allocates cluster resources; it is often described as the "operating system" of Hadoop.
  4. MapReduce is the original processing model for Hadoop clusters. It distributes work within the cluster (the map step), then organizes and reduces the per-node results into a response to a query (the reduce step). Many other processing models are available for the 3.x version of Hadoop.
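
Once a cluster is configured and started as in the sections below, all four layers can be exercised at once by running one of the bundled MapReduce examples over HDFS via YARN. A minimal sketch, assuming a stock $HADOOP_HOME layout and a running cluster:

hdfs dfs -mkdir -p /user/hadoop/input
hdfs dfs -put $HADOOP_HOME/etc/hadoop/*.xml /user/hadoop/input
yarn jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
    wordcount /user/hadoop/input /user/hadoop/output
hdfs dfs -cat /user/hadoop/output/part-r-00000 | head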

Configuration

mkdir -p /home/hadoop/tmp
sudo tee $HADOOP_HOME/etc/hadoop/core-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop/tmp</value>
    </property>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
        <description>The default file system URI</description>
    </property>
</configuration>
EOF
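
Whether the value is actually picked up can be checked with hdfs getconf (assuming $HADOOP_HOME/bin is on PATH):

hdfs getconf -confKey fs.defaultFS
# expected: hdfs://0.0.0.0:9000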

mkdir -p /home/hadoop/hdfs/{datanode,namenode}
sudo tee $HADOOP_HOME/etc/hadoop/hdfs-site.xml >/dev/null <<EOF
<configuration>
    <property>
      <name>dfs.replication</name>
      <value>1</value>
    </property>
    <property>
      <name>dfs.namenode.name.dir</name>
      <value>/home/hadoop/hdfs/namenode/</value>
    </property>
    <property>
      <name>dfs.datanode.data.dir</name>
      <value>/home/hadoop/hdfs/datanode/</value>
    </property>
</configuration>
EOF
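
With the storage directories declared, the NameNode has to be formatted once (and only once; re-formatting generates a new cluster ID and orphans existing DataNodes) before the first start. A sketch, assuming the hadoop user should own the tree:

sudo chown -R hadoop:hadoop /home/hadoop/hdfs
sudo -u hadoop -H $HADOOP_HOME/bin/hdfs namenode -format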

sudo tee $HADOOP_HOME/etc/hadoop/mapred-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
EOF
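
On Hadoop 3.x, jobs submitted through YARN can fail with "Could not find or load main class ... MRAppMaster" unless MapReduce's classpath is declared; the Apache single-node guide adds a property along these lines inside the same <configuration> element:

<property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>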

sudo tee $HADOOP_HOME/etc/hadoop/yarn-site.xml >/dev/null <<EOF
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>
EOF
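
A quick sanity check that the four generated files are well-formed XML (assumes xmllint from libxml2 is installed):

for CONF in core hdfs mapred yarn; do
    xmllint --noout $HADOOP_HOME/etc/hadoop/${CONF}-site.xml && echo "${CONF}-site.xml OK"
done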

Unit Testing

lxc launch images:debian/12 agronomy &&
lxc exec agronomy bash <<'EOF'
sleep 5
apt-get update
# Debian 12 (bookworm) ships OpenJDK 17; openjdk-11 is not in its repos
apt-get install -y curl wget openjdk-17-jre\
 openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
lxc launch images:fedora/37 robotics &&
lxc exec robotics bash <<'EOF'
sleep 5
dnf install -y curl wget java-11-openjdk\
 openssh-clients openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
lxc launch images:ubuntu/22.04 software &&
lxc exec software bash <<'EOF'
sleep 5
apt-get update
apt-get install -y curl wget openjdk-11-jre\
 openssh-client openssh-server
systemctl daemon-reload
systemctl restart sshd
systemctl status sshd
java -version
EOF
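
A quick loop can confirm that Java landed in all three containers (names as launched above):

for CONTAINER in agronomy robotics software; do
    echo "== ${CONTAINER} =="
    lxc exec ${CONTAINER} -- sh -c 'java -version 2>&1 | head -n1'
done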

create alias:
lxc stop     agronomy
lxc publish  agronomy --alias\
 debian/12:java:ssh

create alias from snapshot:
lxc snapshot agronomy java:ssh
lxc publish  agronomy/java:ssh --alias\
 debian/12:java:ssh
lxc delete   agronomy

launch alias:
lxc launch   debian/12:java:ssh agronomy &&
lxc exec     agronomy bash
lxc stop     agronomy && lxc delete agronomy

create alias:
lxc stop     robotics
lxc publish  robotics --alias\
 fedora/37:java:ssh

create alias from snapshot:
lxc snapshot robotics java:ssh
lxc publish  robotics/java:ssh --alias\
 fedora/37:java:ssh
lxc delete   robotics

launch alias:
lxc launch   fedora/37:java:ssh robotics &&
lxc exec     robotics bash
lxc stop     robotics && lxc delete robotics

create alias:
lxc stop     software
lxc publish  software --alias\
 ubuntu/22.04:java:ssh

create alias from snapshot:
lxc snapshot software java:ssh
lxc publish  software/java:ssh --alias\
 ubuntu/22.04:java:ssh
lxc delete   software

launch alias:
lxc launch   ubuntu/22.04:java:ssh software &&
lxc exec     software bash
lxc stop     software && lxc delete software
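
Before relaunching from the aliases, it is worth confirming the images were actually published (aliases as created above):

lxc image list | grep 'java:ssh'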

Knowledge

ssh-keygen -b 4096 -t rsa -f ~/.ssh/id_rsa -q -N '' -C "hadoop@${HOSTNAME}"
readlink -f /usr/bin/java | sed "s:bin/java::"
sudo apt-get install pdsh
sudo apt-get install ssh
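
pdsh defaults to rsh, which makes the start-*.sh scripts fail; pointing it at ssh and authorizing the key generated above are the usual companion steps. A sketch for a single-node setup:

echo 'export PDSH_RCMD_TYPE=ssh' >> ~/.bashrc
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
ssh -o StrictHostKeyChecking=accept-new localhost true && echo 'passwordless ssh OK'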

su - hadoop
hdfs namenode -format
sudo -u hadoop -H sh -c 'whoami; echo ${HOME}'
sh $HADOOP_HOME/sbin/start-dfs.sh
http://127.0.0.1:9870
sh $HADOOP_HOME/sbin/start-yarn.sh
http://127.0.0.1:8088
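
After both start scripts have run, jps (shipped with the JDK) should list the daemons; a quick check:

jps
# expected alongside Jps itself (PIDs vary):
# NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager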

sudo apt dist-upgrade
sudo do-release-upgrade

sudo apt --fix-broken install
sudo apt install ubuntu-desktop

[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
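
For context, a minimal sketch of a unit this [Service] fragment could belong to; the unit name, paths, and ExecStart/ExecStop are hypothetical and assume a tarball install under /opt/hadoop. SuccessExitStatus=143 is kept because Java daemons exit with 128+SIGTERM on a clean stop:

# /etc/systemd/system/hadoop-dfs.service -- hypothetical name and paths
[Unit]
Description=Hadoop HDFS daemons (sketch)
After=network-online.target

[Service]
User=hadoop
Group=hadoop
Type=forking
SuccessExitStatus=143
Environment=HADOOP_HOME=/opt/hadoop
ExecStart=/opt/hadoop/sbin/start-dfs.sh
ExecStop=/opt/hadoop/sbin/stop-dfs.sh

[Install]
WantedBy=multi-user.target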

if [ -f '/etc/os-release' ]; then
         HOST_OS_ID=$(grep -oP '(?<=^ID=).+'         /etc/os-release | tr -d '"')
    HOST_OS_ID_LIKE=$(grep -oP '(?<=^ID_LIKE=).+'    /etc/os-release | tr -d '"')
    HOST_OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"')
fi
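
These variables typically feed a package-manager switch; a minimal sketch using the values read above (the PKG_INSTALL variable is illustrative):

case "${HOST_OS_ID}:${HOST_OS_ID_LIKE}" in
    fedora:*|*fedora*) PKG_INSTALL='dnf install -y';;
    debian:*|*debian*) PKG_INSTALL='apt-get install -y';;
                    *) echo "unsupported OS: ${HOST_OS_ID} ${HOST_OS_VERSION}" >&2;;
esac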

declare -a HADOOP_SCHEDULERS=(dev prod);\
declare -A MINMAX_CAPACITIES=([dev]='50 50' [prod]='50 70');\
for HADOOP_SCHEDULER in ${HADOOP_SCHEDULERS[@]};do \
    for MINMAX_CAPACITY in ${MINMAX_CAPACITIES[${HADOOP_SCHEDULER}]};do \
        echo "${HADOOP_SCHEDULER} => ${MINMAX_CAPACITY}";\
    done;\
done
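
If the minimum and maximum are needed as separate values, read splits the space-separated pair cleanly (same arrays as above):

for HADOOP_SCHEDULER in "${HADOOP_SCHEDULERS[@]}"; do
    read -r MIN_CAPACITY MAX_CAPACITY <<< "${MINMAX_CAPACITIES[${HADOOP_SCHEDULER}]}"
    echo "${HADOOP_SCHEDULER}: min=${MIN_CAPACITY}% max=${MAX_CAPACITY}%"
done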

References

  * Installation of PySpark (https://www.datacamp.com/tutorial/installation-of-pyspark)
  * Bastion SSH Tunneling
  * Linux User Creation
  * Linux Containers