Need Help With High Availability, IP Failover, Heartbeat - Xpost
If someone could please help me with our attempt at a high availability cluster, I'd greatly appreciate it. We're trying to use a floating IP, Linode's IP failover, and heartbeat with pacemaker.
I've mostly followed this old guide, which I've seen copied in various places:
The two servers are Smallpox and Chickenpox (primary). Below are our configuration settings.
# Smallpox - /etc/network/interfaces
# The loopback network interface
auto lo
iface lo inet loopback
# The primary network interface
auto eth0 eth0:0 eth0:1
iface eth0 inet static
address 66.228.44.168
netmask 255.255.255.0
gateway 66.228.44.1
iface eth0:0 inet static
address 192.168.174.209
netmask 255.255.128.0
# Floating IP
iface eth0:1 inet static
address 23.239.8.163
netmask 255.255.255.0
# Chickenpox - /etc/network/interfaces
# The loopback network interface
auto lo
iface lo inet loopback
# The primary network interface
auto eth0 eth0:0 eth0:1
iface eth0 inet static
address 23.239.8.83
netmask 255.255.255.0
gateway 23.239.8.1
# private ip
iface eth0:0 inet static
address 192.168.138.246
netmask 255.255.128.0
# Floating IP
iface eth0:1 inet static
address 23.239.8.163
netmask 255.255.255.0
The following is placed on both servers
# /etc/ha.d/ha.cf
autojoin none
logfacility syslog
# how many seconds between heartbeats
keepalive 2
# seconds until host is declared dead
deadtime 10
# how quickly to issue a "late heartbeat" warning
warntime 10
# grace period for declaring dead after heartbeat is first started
initdead 60
# which port to use for udp or ppp-udp
udpport 694
# Private IP addresses to communicate with
ucast eth0 192.168.138.246
ucast eth0 192.168.174.209
node chickenpox
node smallpox
# do not set primary back to original when possible
auto_failback off
# use pacemaker
crm respawn
This is our pacemaker configuration:
node $id="33dd51f3-4e89-4d70-a6e5-30a849ae68d2" chickenpox \
attributes standby="off"
node $id="887438f9-6991-4e15-a188-d5afaa4cc0d3" smallpox \
attributes standby="on"
primitive ip1 ocf:heartbeat:IPaddr2 \
params ip="23.239.8.163" nic="eth0" \
op monitor interval="5s"
primitive ip1arp ocf:heartbeat:SendArp \
params ip="23.239.8.163" nic="eth0"
group HAServices ip1 ip1arp \
meta target-role="Started"
order ip-before-arp inf: ip1:start ip1arp:start
property $id="cib-bootstrap-options" \
dc-version="1.1.6-9971ebba4494012a93c03b40a2c58ec0eb60f50c" \
cluster-infrastructure="Heartbeat" \
expected-quorum-votes="1" \
stonith-enabled="false" \
no-quorum-policy="ignore"
rsc_defaults $id="rsc-options" \
resource-stickiness="100"
Note that we are not using eth0:1 for our floating IP in the "nic" parameter as that was issuing warnings in the logs and the docs state to use eth0.
Here is some output:
smallpox$: ip addr sh eth0
3: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
link/ether fe:fd:42:e4:2c:a8 brd ff:ff:ff:ff:ff:ff
inet 66.228.44.168/24 brd 66.228.44.255 scope global eth0
valid_lft forever preferred_lft forever
inet 192.168.174.209/17 brd 192.168.255.255 scope global eth0:0
valid_lft forever preferred_lft forever
inet 23.239.8.163/24 brd 23.239.8.255 scope global eth0:1
valid_lft forever preferred_lft forever
inet6 fe80::fcfd:42ff:fee4:2ca8/64 scope link
valid_lft forever preferred_lft forever
chickenpox$: ip addr sh eth0
3: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
link/ether f2:3c:91:db:94:9d brd ff:ff:ff:ff:ff:ff
inet 23.239.8.83/24 brd 23.239.8.255 scope global eth0
valid_lft forever preferred_lft forever
inet 192.168.138.246/17 brd 192.168.255.255 scope global eth0:0
valid_lft forever preferred_lft forever
inet 23.239.8.163/32 brd 23.239.8.163 scope global eth0
valid_lft forever preferred_lft forever
inet 23.239.8.163/24 brd 23.239.8.255 scope global secondary eth0:1
valid_lft forever preferred_lft forever
inet6 2600:3c03::f03c:91ff:fedb:949d/64 scope global dynamic
valid_lft 2591976sec preferred_lft 604776sec
inet6 fe80::f03c:91ff:fedb:949d/64 scope link
valid_lft forever preferred_lft forever
When we take the chickenpox server offline, we notice some chatter in the logs from heartbeat, but no noticeable errors. Unfortunately, the IP does not fail over and we lose connectivity on the floating IP.
We have also tried adding the following to /etc/sysctl.conf
net.ipv4.ip_nonlocal_bind=1
net.ipv4.conf.all.promote_secondaries=1
I am by no means a networking expert, so any help is truly appreciated.
Thanks!