#!/usr/bin/perl
#
# Passive fence agent - Checks NIC link state
#
# Digimer; digimer@digimer.ca
# http://alteeve.ca/w
#
# This software is released under the GPL v2. See the LICENSE file in the
# configuration directory for a copy of the GPL v2.
#
# Bugs;
# - None known, many expected
#
# Warnings;
# - This agent should be used *only* when a node has a single power supply fed
#   by a switched PDU. Further, it should be used as the lowest-priority method
#   after IPMI (or the like) and PDU based fence methods.
#
# PLEASE READ THE 'README.md' FOR FURTHER INFORMATION BEFORE USING THIS AGENT!
#
# Notes;
# - Requires that the 'ethtool' package be installed.
#

# Play safe!
use strict;
use warnings;

# IO::Handle provides the log file handle that is passed around the agent.
use IO::Handle;

# Catch signals for clean exits.
$SIG{INT}  = \&_catch_sig;
$SIG{TERM} = \&_catch_sig;

# These are the default values and will be over-written by the config file's
# variables which in turn can, in some cases, be over-written by command line
# arguments.
my $conf = {
    'system' => {
        action        => "off",
        agent_version => "1.0",
        debug         => 0,
        device        => "",
        got_cla       => 0,     # This is set if command line arguments are read.
        list          => "",
        'log'         => "/var/log/fence_passive_nic.log",
        monitor       => "",
        node_name     => "",    # My name, as reported by cman_tool
        nodes         => 0,     # Number of nodes in the cluster
        password      => "",
        quiet         => 0,
        version       => 0,
    },
    path => {
        ethtool   => "/sbin/ethtool",
        ccs       => "/usr/sbin/ccs",
        cman_tool => "/usr/sbin/cman_tool",
    },
};

# Log file for output. The three-argument form of 'open' keeps the append mode
# separate from the path, so nothing in the path can be read as a mode.
my $log = IO::Handle->new();
print "Opening: [$conf->{'system'}{'log'}] for logging.\n" if $conf->{'system'}{debug};
open($log, '>>', $conf->{'system'}{'log'}) or die "Failed to open: [$conf->{'system'}{'log'}] for writing; Error: $!\n";

# Set $log and STDOUT to hot (unbuffered) output.
if (1) {
    select $log;
    $| = 1;
    select STDOUT;
    $| = 1;
}

# If this gets set by any of the checks below, the agent will exit.
my $bad = 0;

# Read in arguments from the command line.
($bad) = read_cla($conf, $log, $bad);

# Now read in arguments from STDIN, which is how 'fenced' passes arguments.
($bad) = read_stdin($conf, $log, $bad);

# If I've been asked to show the metadata XML, do so and then exit.
if ($conf->{'system'}{action} eq "metadata") {
    metadata($conf, $log);
    do_exit($conf, $log, 0);
}

# If I've been asked to show the version information, do so and then exit.
if ($conf->{'system'}{version}) {
    version($conf, $log);
    do_exit($conf, $log, 0);
}

# Start the logs.
record($conf, $log, "-=] Called at: [".get_date_time($conf)."]\n", 2);

# Make sure I was passed a network card name.
if (not $conf->{'system'}{device}) {
    $bad = 1;
    record($conf, $log, "Called without a network device name! Please set: [node=\"{,}\"] in cluster.conf or use: [-n {,}] at the command line!\n", 1);
}

# Make sure there is at least one other fence agent configured before this one.
# Only ever raise the error flag here; assigning the return value directly
# would wipe out a failure recorded by the checks above.
$bad = 1 if check_for_other_agents($conf);

# Make sure this cluster has only two nodes.
if ($conf->{'system'}{nodes} != 2) {
    $bad = 1;
    record($conf, $log, "The 'fence_passive_nic' fence agent can only be used in two-node clusters!\n", 1);
}

# If anything went wrong, die now.
if ($bad) {
    record($conf, $log, "Exiting on errors.\n", 1);
    do_exit($conf, $log, 1);
}
record($conf, $log, "Will check device: [$conf->{'system'}{device}]\n") if $conf->{'system'}{debug};

###############################################################################
# What do?                                                                    #
###############################################################################

# When asked to 'monitor' or 'list'. being multi-port, this will return a CSV
# of ports and their current state.
record($conf, $log, "Requested action: [$conf->{'system'}{action}].\n") if $conf->{'system'}{debug};
if (($conf->{'system'}{action} eq "monitor") or ($conf->{'system'}{action} eq "list")) {
    record($conf, $log, "Calling the 'show_list' function.\n") if $conf->{'system'}{debug};
    show_list($conf, $log);
    do_exit($conf, $log, 0);
}

# Do it!
my $exit_code = process_action($conf, $log);
record($conf, $log, "Check complete, exiting.\n") if $conf->{'system'}{debug};

# Cleanup and exit.
do_exit($conf, $log, $exit_code);

###############################################################################
# Here be functions.                                                          #
###############################################################################

# Map fence device names to fence agents.
#
# Runs 'ccs --lsfencedev' against localhost and stores every
# 'device: ... agent=X' pair it prints in $conf->{fence_device}{device}.
# Logging goes to the file-level $log handle. Dies if the shell call can not be
# started; always returns 0 otherwise.
sub map_fence_devices_to_agents {
    my ($conf) = @_;

    # The password is wrapped in single quotes (with embedded single quotes
    # escaped) so the shell passes it to 'ccs' as exactly one argument, even
    # when it is empty or contains shell metacharacters.
    my $password = defined $conf->{'system'}{password} ? $conf->{'system'}{password} : "";
    $password =~ s/'/'\\''/g;

    my $sc = "$conf->{path}{ccs} -h localhost -p '$password' --lsfencedev";
    record($conf, $log, "calling: [$sc]\n") if $conf->{'system'}{debug};
    open(my $fh, '-|', "$sc 2>&1") or die "Shell call: [$sc] failed; Error: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        record($conf, $log, "Output: [$line]\n") if $conf->{'system'}{debug};
        if ($line =~ /^(.*?):.*?agent=(\S+)/) {
            my $device = $1;
            my $agent  = $2;
            $conf->{fence_device}{$device} = $agent;
            record($conf, $log, "Found device: [$device] which uses agent: [$conf->{fence_device}{$device}]\n") if $conf->{'system'}{debug};
        }
    }
    close($fh);
    my $exit = $?;
    record($conf, $log, "Exit code: [$exit]\n") if $conf->{'system'}{debug};

    return (0);
}

# Get the name of this node.
# Reads 'cman_tool status' and stores the reported 'Node name' in
# $conf->{'system'}{node_name}. Exits the agent with status 9 when cman
# reports that it can not open a connection. Logging goes to the file-level
# $log handle. Always returns 0.
sub get_local_node_name {
    my ($conf) = @_;

    my $sc = "$conf->{path}{cman_tool} status";
    record($conf, $log, "calling: [$sc]\n") if $conf->{'system'}{debug};
    open(my $fh, '-|', "$sc 2>&1") or die "Shell call: [$sc] failed; Error: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        record($conf, $log, "Output: [$line]\n") if $conf->{'system'}{debug};
        if ($line =~ /Cannot open connection/i) {
            # The user must run this while the cluster is up.
            record($conf, $log, "The cluster is not running. Please start 'cman' and try again.\n", 1);
            do_exit($conf, $log, 9);
        }
        if ($line =~ /Node name: (.*)/) {
            $conf->{'system'}{node_name} = $1;
            record($conf, $log, "Found my name: [$conf->{'system'}{node_name}]\n") if $conf->{'system'}{debug};
        }
    }
    close($fh);
    my $exit = $?;
    record($conf, $log, "Exit code: [$exit]\n") if $conf->{'system'}{debug};

    return (0);
}

# Get the peer's name
#
# Reads 'cman_tool nodes', counts the listed nodes in $conf->{'system'}{nodes}
# and stores the name that is not our own in $conf->{'system'}{peer_name}.
# Logging goes to the file-level $log handle. Always returns 0.
sub get_peer_node_name {
    my ($conf) = @_;

    # This is used to make sure this is a two-node cluster.
    $conf->{'system'}{nodes} = 0;

    my $sc = "$conf->{path}{cman_tool} nodes";
    record($conf, $log, "calling: [$sc]\n") if $conf->{'system'}{debug};
    open(my $fh, '-|', "$sc 2>&1") or die "Shell call: [$sc] failed; Error: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        record($conf, $log, "Output: [$line]\n") if $conf->{'system'}{debug};
        next if $line =~ /^Node/;    # Skip the title line.

        # The node name is the last whitespace-separated field. Lines without
        # one (blank lines, for example) are not nodes and must not be counted.
        my ($node) = ($line =~ /.*?\s(\S+)$/);
        next if not defined $node;

        if ($node ne $conf->{'system'}{node_name}) {
            $conf->{'system'}{peer_name} = $node;
            record($conf, $log, "Peer node: [$conf->{'system'}{peer_name}]\n") if $conf->{'system'}{debug};
        }
        $conf->{'system'}{nodes}++;
    }
    close($fh);
    my $exit = $?;
    record($conf, $log, "Exit code: [$exit]\n") if $conf->{'system'}{debug};

    return (0);
}

# This uses 'ccs' to verify that there is at least one other agent configured
# before fence_passive_nic.
# Collects the device-to-agent map, this node's name and the peer's name, then
# walks 'ccs --lsfenceinst' looking at the peer's fence devices. Logging goes
# to the file-level $log handle.
#
# Returns 0 when the first of the peer's devices with a known agent uses
# something other than 'fence_passive_nic', 1 otherwise.
sub check_for_other_agents {
    my ($conf) = @_;

    record($conf, $log, "Checking that at least one other fence method is used before me.\n") if $conf->{'system'}{debug};
    map_fence_devices_to_agents($conf);
    record($conf, $log, "Getting my name.\n") if $conf->{'system'}{debug};
    get_local_node_name($conf);
    record($conf, $log, "I am: [$conf->{'system'}{node_name}]. Getting my peer's name.\n") if $conf->{'system'}{debug};
    get_peer_node_name($conf);

    my $bad    = 1;
    my $victim = $conf->{'system'}{peer_name};

    # Without a peer name there is nothing to look up, and an empty pattern
    # in the match below would not mean "match nothing". Fail safe.
    if ((not defined $victim) or ($victim eq "")) {
        record($conf, $log, "Unable to determine the peer node's name, so its fence devices can not be checked.\n", 1);
        record($conf, $log, "Failed to find another fence device before this one!\n", 1);
        return ($bad);
    }
    record($conf, $log, "My peer is: [$conf->{'system'}{peer_name}].\n") if $conf->{'system'}{debug};
    record($conf, $log, "Victime node: [$victim]\n") if $conf->{'system'}{debug};

    # Quote the password so the shell hands it to 'ccs' as one argument.
    my $password = defined $conf->{'system'}{password} ? $conf->{'system'}{password} : "";
    $password =~ s/'/'\\''/g;

    my $parse = 0;
    my $sc    = "$conf->{path}{ccs} -h localhost -p '$password' --lsfenceinst";
    record($conf, $log, "calling: [$sc]\n") if $conf->{'system'}{debug};
    open(my $fh, '-|', "$sc 2>&1") or die "Shell call: [$sc] failed; Error: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        record($conf, $log, "Output: [$line], victim: [$victim]\n") if $conf->{'system'}{debug};

        # \Q...\E makes the dots in a host name match literally.
        if ($line =~ /\Q$victim\E/) {
            $parse = 1;
            record($conf, $log, "Found the victim's fence devices.\n") if $conf->{'system'}{debug};
            next;
        }
        next if not $parse;

        # A line that doesn't start with a space is a new node.
        if ($line =~ /^\S/) {
            $parse = 0;
            record($conf, $log, "Left the victim's fence devices.\n") if $conf->{'system'}{debug};
            next;
        }
        if ($line =~ /(\S+):/) {
            my $device      = $1;
            my $agent       = $conf->{fence_device}{$device};
            my $agent_shown = defined $agent ? $agent : "";
            record($conf, $log, "Parsed device: [$device] which uses agent: [$agent_shown].\n") if $conf->{'system'}{debug};
            if (not $agent) {
                record($conf, $log, "No agent defined for this device!\n") if $conf->{'system'}{debug};
                next;
            }

            # The first device with a known agent decides the outcome.
            if ($agent ne "fence_passive_nic") {
                record($conf, $log, "The agent is different, we're good to go.\n") if $conf->{'system'}{debug};
                $bad = 0;
            }
            else {
                record($conf, $log, "The agent is ours! This will not do.\n") if $conf->{'system'}{debug};
            }
            last;
        }
    }
    close($fh);
    my $exit = $?;
    record($conf, $log, "Exit code: [$exit]\n") if $conf->{'system'}{debug};

    if ($bad) {
        record($conf, $log, "Failed to find another fence device before this one!\n", 1);
    }

    return ($bad);
}

# This cleanly exits the agent.
#
# Writes a final newline to the log and closes it when the handle is open, then
# exits with $exit_status (9 when no status was given). Never returns.
sub do_exit {
    my ($conf, $log, $exit_status) = @_;
    $exit_status = 9 if not defined $exit_status;

    # Close the log file handle, if it exists and is open.
    if ($log and defined fileno($log)) {
        print $log "\n";
        $log->close();
    }

    exit($exit_status);
}

# This gets the link state of the requested device(s) and returns it as 'yes',
# 'no' or an empty string when the state could not be determined. The result is
# also stored in $conf->{'system'}{link_state}.
sub get_state {
    my ($conf, $log) = @_;

    $conf->{'system'}{link_state} = "";
    my $yes_count = 0;
    my $no_count  = 0;
    my $seen      = 0;

    # There may be one or more devices specified.
    foreach my $device (split /,/, $conf->{'system'}{device}) {
        $seen++;
        my $this_state = "";

        # Trim the name and single-quote it so the shell passes it to
        # 'ethtool' as exactly one argument.
        (my $quoted = $device) =~ s/^\s+|\s+$//g;
        $quoted =~ s/'/'\\''/g;

        my $sc = "$conf->{path}{ethtool} '$quoted'";
        record($conf, $log, "calling: [$sc]\n") if $conf->{'system'}{debug};
        open(my $fh, '-|', "$sc 2>&1") or die "Shell call: [$sc] failed; Error: $!\n";
        while (my $line = <$fh>) {
            chomp $line;
            record($conf, $log, "Output: [$line]\n") if $conf->{'system'}{debug};
            # Capture only the word itself so trailing whitespace can not
            # turn a readable state into an unknown one.
            if (lc($line) =~ /link detected:\s*(\S+)/) {
                $this_state = $1;
                record($conf, $log, "Read link state of: [$device] as: [$this_state]\n") if $conf->{'system'}{debug};
                last;
            }
        }
        close($fh);
        my $exit = $?;
        record($conf, $log, "Exit code: [$exit]\n") if $conf->{'system'}{debug};

        if ($this_state eq "yes") {
            record($conf, $log, "The: [$device] shows a link.\n");
            $yes_count++;
        }
        elsif ($this_state eq "no") {
            record($conf, $log, "The: [$device] does not show a link.\n");
            $no_count++;
        }
        else {
            record($conf, $log, "WARNING: Unable to read the link state for: [$device].\n");
        }
    }

    # If I see even one "yes", set the state to "yes" as the other node
    # has power. If all the states I read were 'no', then set the state
    # to "no". With no devices at all nothing was read, so the state must
    # stay unknown rather than being reported as "no".
    if ($yes_count > 0) {
        $conf->{'system'}{link_state} = "yes";
    }
    elsif (($seen > 0) and ($no_count == $seen)) {
        $conf->{'system'}{link_state} = "no";
    }

    # Return the link state string.
    return ($conf->{'system'}{link_state});
}

# This prints the 'help' message and then exits with status 0.
sub help {
    my ($conf, $log) = @_;

    # Point the user at the man page.
    print "See 'man fence_passive_nic' for instructions on using the Passive NIC Fence Agent.\n";

    do_exit($conf, $log, 0);
}

# This simply prints the 'metadata' data to STDOUT and then exits with
# status 0.
#
# NOTE(review): the text below contains no XML tags, although the agent's
# 'metadata' action is described as showing XML. The markup looks like it was
# lost somewhere upstream of this file - confirm against the released agent.
sub metadata {
    my ($conf, $log) = @_;

    print q` This fence agent can not actually fence a node! This agent should never be used as the only fence device. It should only be used on cluster nodes with a single power supply. Further, you should have both IPMI (or the like) fencing and switched PDU fencing as higher priority fence methods in your cluster configuration. The goal of this agent is to allow the administrator to connect a dedicated network interface in a two-node cluster back-to-back and then check the link state to determine whether the other node is alive or dead. This works because a crashed or hung node will continue to provide a link to the peer node. Thus, if the link is down, we can somewhat safely assume that the peer has lost power. If the node has lost power, we can safely return a "fence success" exit code, allowing recovery to begin. The network interface name to check for a link status. This is the password needed to authenticate against the ricci user. Action (operation) to take; off, on, reboot, status, monitor, list, metadata. Only 'off', 'on' and 'status' make sense, and they only check the current state, not set it. The 'reboot' action is treated as an 'off' action. (default) Supress all output to STDOUT, including critical messages. Check logfile if used. Default 1 (quiet). Print extensive debug information to STDOUT and to the log file. Prints the fence agent version and exits. `;

    # Done, exit.
    do_exit($conf, $log, 0);
}

# This handles the actual actions.
# Checks the link state for the 'on', 'off', 'reboot' and 'status' actions,
# reports the result and exits through do_exit() with:
#   on             0 when a link is seen, 1 otherwise
#   off / reboot   0 when no link is seen, 1 otherwise
#   status         0 link up, 2 link down, 1 state unknown
#   anything else  1
# The trailing return is only reached if do_exit() comes back.
sub process_action {
    my ($conf, $log) = @_;
    record($conf, $log, "In the 'process_action' function.\n") if $conf->{'system'}{debug};
    my $exit_code = 0;

    # Make this more readable.
    my $action = $conf->{'system'}{action};
    my $device = $conf->{'system'}{device};
    record($conf, $log, "action: [$action], device: [$device]\n") if $conf->{'system'}{debug};

    # The following actions require a device. Error if I don't have one. This
    # is a critical message, so it is reported whether or not debug is on.
    if (not $device) {
        if (($action eq "on") || ($action eq "off") || ($action eq "reboot") || ($action eq "status")) {
            record($conf, $log, "\nERROR! Action request: [$action] requires a ethernet device name!\n", 1);
            do_exit($conf, $log, 1);
        }
    }

    # A comma in the device string means several devices were given, which
    # only changes the wording of the messages.
    my $several  = ($device =~ /,/) ? 1 : 0;
    my $message  = "";
    my $critical = 0;

    if ($action eq "on") {
        # Unfencing the node.
        if (get_state($conf, $log) eq "yes") {
            $message = $several ? "SUCCESS: At least one of the: [$device] devices shows a link.\n"
                                : "SUCCESS: Device: [$device] shows a link.\n";
            $exit_code = 0;
        }
        else {
            $message = $several ? "FAILED: The: [$device] devices show no link.\n"
                                : "FAILED: Device: [$device] shows no link.\n";
            $exit_code = 1;
        }
    }
    elsif (($action eq "off") || ($action eq "reboot")) {
        # Fencing the node. I do 'reboot' here because all 'reboot'
        # needs for success is 'off'.
        if (get_state($conf, $log) eq "no") {
            $message = $several ? "SUCCESS: The: [$device] devices show no link.\n"
                                : "SUCCESS: Device: [$device] shows no link.\n";
            $exit_code = 0;
        }
        else {
            $message = $several ? "FAILED: At least one of the: [$device] devices shows a link.\n"
                                : "FAILED: Device: [$device] shows a link.\n";
            $exit_code = 1;
        }
    }
    elsif ($action eq "status") {
        # This needs to return;
        # 0 = Link is up.
        # 1 = device not found.
        # 2 = Link is down
        my $state = get_state($conf, $log);
        if ($state eq "yes") {
            $message   = $several ? "Status: At least one link is up\n" : "Status: Link is up\n";
            $exit_code = 0;
        }
        elsif ($state eq "no") {
            $message   = $several ? "Status: Links are down\n" : "Status: Link is down\n";
            $exit_code = 2;
        }
        else {
            $message   = $several ? "Status: At least one link state is unknown\n" : "Status: Link state unknown\n";
            $exit_code = 1;
        }
    }
    else {
        $message   = "ERROR: Unknown action request: [$action]!\n";
        $critical  = 1;
        $exit_code = 1;
    }

    record($conf, $log, $message, $critical);
    do_exit($conf, $log, $exit_code);

    return ($exit_code);
}

# Read in command line arguments
#
# Walks @ARGV, storing values in $conf->{'system'} and setting 'got_cla' as
# soon as any argument is seen. Returns $bad, raised to 1 when an invalid
# argument was found.
sub read_cla {
    my ($conf, $log, $bad) = @_;

    # Switches that store the following argument, and where it is stored.
    my %stores_next = (
        n => "device",      # This is the device to work on.
        o => "action",      # This is the action to take.
        p => "password",    # This sets the password to use for ricci.
    );

    # Loop through the passed arguments, if any.
    record($conf, $log, "Got args:\n") if $conf->{'system'}{debug};
    my $set_next = "";
    foreach my $arg (@ARGV) {
        record($conf, $log, "[$arg]\n") if $conf->{'system'}{debug};
        $conf->{'system'}{got_cla} = 1;

        # If 'set_next' has a value, push this argument into the 'conf'
        # hash.
        if ($set_next) {
            # Record the values.
            $conf->{'system'}{$set_next} = $arg;
            record($conf, $log, "Setting: 'system::$set_next': [$conf->{'system'}{$set_next}]\n") if $conf->{'system'}{debug};

            # Clear it now for the next go-round.
            $set_next = "";
            next;
        }

        # The switches are matched whole, so an argument that merely contains
        # '-h', '-q' or '-d' somewhere is not mistaken for one of them.
        if ($arg =~ /^(?:-h|--help)$/) {
            # Print the help message and then exit.
            help($conf, $log);
        }
        elsif ($arg =~ /^--version$/) {
            # Print the version information and then exit.
            $conf->{'system'}{version} = 1;
            record($conf, $log, "Setting version\n") if $conf->{'system'}{debug};
        }
        elsif ($arg =~ /^(?:-q|--quiet)$/) {
            # Suppress all messages, including critical messages, from STDOUT.
            $conf->{'system'}{quiet} = 1;
        }
        elsif ($arg =~ /^(?:-d|--debug)$/) {
            # Enable debug mode.
            $conf->{'system'}{debug} = 1;
        }
        elsif ($arg =~ /^-/) {
            # Work on a copy; $arg is an alias into @ARGV. Unknown switches
            # are ignored.
            (my $switch = $arg) =~ s/^-//;
            $set_next = $stores_next{$switch} if exists $stores_next{$switch};
        }
        else {
            # Bad argument.
            record($conf, $log, "\nERROR: Argument: [$arg] is not valid!\n");
            record($conf, $log, "ERROR: Please run: [man fence_passive_nic] to see a list of valid arguments.\n\n");
            $bad = 1;
        }
    }

    return ($bad);
}

# Read arguments from STDIN. This is adapted from the 'fence_brocade' agent.
#
# Does nothing when command line arguments were already read. Otherwise parses
# 'name=value' lines into $conf->{'system'}. Returns $bad unchanged; unknown
# names are reported but deliberately not treated as errors.
sub read_stdin {
    my ($conf, $log, $bad) = @_;

    return ($bad) if $conf->{'system'}{got_cla};

    my $line_count = 0;
    while (defined(my $option = <STDIN>)) {
        # Get rid of newlines.
        chomp $option;

        # Record the line for now, but comment this out before release.
        # The ricci password is masked so it never lands in the log.
        (my $shown_option = $option) =~ s/^(\s*passwd\s*=\s*).*$/${1}<hidden>/;
        record($conf, $log, "Processing option line: [$shown_option]\n", 2);

        # strip leading and trailing whitespace
        $option =~ s/^\s*//;
        $option =~ s/\s*$//;

        # skip comments
        next if ($option =~ /^#/);

        # Increment my option line count.
        $line_count++;

        # Go to the next line if the option line is empty.
        next if not $option;

        # Split the option up into the name and the value. Only the first '='
        # separates them, so values may contain '=' themselves.
        my ($name, $value) = split /\s*=\s*/, $option, 2;
        $value = "" if not defined $value;

        # Record the line for now, but comment this out before release.
        my $shown_value = ($name eq "passwd") ? "<hidden>" : $value;
        record($conf, $log, "Name: [$name], value: [$shown_value].\n");

        # Set my variables depending on the variable name.
        if ($name eq "agent") {
            # This is only used by 'fenced', but I record it for
            # potential debugging.
            $conf->{'system'}{agent} = $value;
        }
        elsif ($name eq "action") {
            # 'option' is deprecated.
            $conf->{'system'}{action} = $value;
        }
        elsif ($name eq "port") {
            # This sets the device(s) to act on.
            $conf->{'system'}{device} = $value;
        }
        elsif ($name eq "passwd") {
            # This sets the password to use for ricci.
            $conf->{'system'}{password} = $value;
        }
        elsif ($name eq "nodename") {
            # This is passed by 'fenced' via 'cluster.conf' as of
            # cluster version 3, but it's not yet documented.
            $conf->{'system'}{nodename} = $value;
        }
        else {
            record($conf, $log, "\nERROR: Illegal name in option: [$option] at line: [$line_count]\n\n", 1);
            # 'rohara' from #linux-cluster suggested it's better to
            # simply ignore unknown input, as that is the behaviour
            # the fenced authors expect.
            #$bad=1;
        }
    }

    return ($bad);
}

# This function prints messages to the log and, depending on $critical, to the
# currently selected output handle (STDOUT in the agent):
#   0 / unset  normal message; printed unless 'quiet' is set
#   1          critical message; always printed
#   2          log-only message; never printed
# Always returns 0.
sub record {
    my ($conf, $log, $msg, $critical) = @_;
    $critical = 0 if not $critical;

    # The log file gets everything.
    print $log $msg if ($log and defined fileno($log));

    # Log-only messages stop here.
    return (0) if $critical == 2;

    # Critical messages always print; the rest only when not quiet.
    print $msg if ($critical or (not $conf->{'system'}{quiet}));

    return (0);
}

# When asked to 'monitor' or 'list', there is nothing to show as this is not a
# multi-port device. Notes that in debug mode and exits with status 0.
sub show_list {
    my ($conf, $log) = @_;

    if ($conf->{'system'}{debug}) {
        record($conf, $log, "In 'show_list' function.\n");
        record($conf, $log, "This is not a multi-port device. The 'list' action is useless.\n");
    }

    do_exit($conf, $log, 0);
}

# This prints the version of this fence agent and exits with status 0.
sub version {
    my ($conf, $log) = @_;

    # Print the Fence Agent version.
    my $agent_version = $conf->{'system'}{agent_version};
    record($conf, $log, "Fence Agent ver. $agent_version\n", 1);

    do_exit($conf, $log, 0);
}

# This returns the current local date and time as 'YYYY-MM-DD HH:MM:SS'.
sub get_date_time {
    my ($conf) = @_;

    # localtime() counts years from 1900 and months from zero.
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
    my $now = sprintf("%d-%02d-%02d %02d:%02d:%02d", 1900 + $year, $mon + 1, $mday, $hour, $min, $sec);

    return ($now);
}

# Catch SIG, move zig!
#
# Signal handler; reports the signal through the file-level $conf and $log and
# exits with status 1.
sub _catch_sig {
    my $signame = shift;

    record($conf, $log, "fence_passive_nic process with PID $$ Exiting on SIG${signame}.\n", 1);
    do_exit($conf, $log, 1);
}