There has been a recent discussion over at the Usenet newsgroup comp.os.vms concerning backup strategies and suchlike. One of the obvious solutions being proposed is a combination of host- and controller-based disk mirroring, including splitting and recombining disks to retain redundancy at all times.
One poster commented that "it sometimes works too well i.e. a single drive fails in a set and no one notices it. Then a few months later, a second drive fails and then that's it".
This has me worried, so to ensure people notice when a shadow set is reduced, I present the following command procedure:
$! See bottom of command procedure for comments
$! Turn on DCL error checking and send any warning-or-worse status to the
$! "error" label, then define the output shorthand used by the procedure.
$ set on
$ on warning then goto error
$ say := write sys$output
$
$! Find the earliest boot time: begin with this node's value and, when the
$! node is a cluster member, lower it to the smallest value reported for any
$! node returned by f$csid.  f$cvtime is called with its default arguments
$! and the results are ordered with a string comparison.
$ boottime = f$cvtime (f$getsyi ("boottime"))
$ if .not. f$getsyi ("cluster_member") then goto have_boottime
$next_member:
$ member_csid = f$csid (node_context)
$ if member_csid .eqs. "" then goto have_boottime
$ member_boot = f$cvtime (f$getsyi ("boottime",,member_csid))
$ if member_boot .lts. boottime then boottime = member_boot
$ goto next_member
$have_boottime:
$
$! Location of the indexed database file.
$ database = "sys$common:[sysmgr]shadow.data"
$
$! Node name, collapsed and upcased, then formatted with "!6AS" so it lines
$! up with the 6-character node field of a database record.
$ raw_node = f$edit (f$getsyi ("nodename"), "collapse,upcase")
$ node = f$fao ("!6AS", raw_node)
$ say "SHADOW WATCH executing on node " + node
$
$! Record layout.  Every record starts with an 8-character key field.
$ rec_length = 128
$ disk_length = 8
$ disk_offset = 0
$
$! The BOOTTIME record carries a 22-character time directly after the key.
$ date_length = 22
$ date_offset = disk_offset + disk_length
$
$! Disk records carry, after the key: node name, member count, member list.
$ discover_node_length = 6
$ discover_node_offset = disk_offset + disk_length
$ member_cnt_length = 1
$ member_cnt_offset = discover_node_offset + discover_node_length
$ member_offset = member_cnt_offset + member_cnt_length
$
$! Decide whether the database has to be (re)created.  On exit "new" is
$! "TRUE" when it must be created; when it is "FALSE" the file is left open
$! on logical name "info" with the BOOTTIME record just read.
$ new = "TRUE"
$ if f$search (database) .eqs. ""
$ then
$   say "SHADOW WATCH database not found - creating..."
$   goto database_checked
$ endif
$
$ new = "FALSE"
$ open/read/write/share/error=error info 'database'
$ read/end=error/error=error/key="BOOTTIME"/index=0/nolock info record
$
$! The keyed read must have returned the BOOTTIME record itself.
$ found_key = f$extract (disk_offset, disk_length, record)
$ if found_key .nes. "BOOTTIME"
$ then
$   say "Sanity check failed"
$   goto error
$ endif
$
$! A database stamped no earlier than the computed boot time is kept as is.
$ date = f$extract (date_offset, date_length, record)
$ if date .ges. boottime then goto database_checked
$
$ say "SHADOW WATCH database older than boot time - creating..."
$ new = "TRUE"
$ close info
$database_checked:
$! With an existing, current database: compare recorded shadow sets against
$! the live system.  Otherwise: create the indexed file from the in-line FDL,
$! open it and write the BOOTTIME record.  Either way, finish by recording
$! any shadow sets not yet in the database.
$ if .not. new
$ then
$   gosub check_disks
$ else
$   create/fdl=sys$input 'database'
$ deck
FILE
ORGANIZATION indexed
RECORD
CARRIAGE_CONTROL carriage_return
FORMAT fixed
SIZE 128
AREA 0
ALLOCATION 50
BEST_TRY_CONTIGUOUS yes
BUCKET_SIZE 3
EXTENSION 50
AREA 1
ALLOCATION 50
BEST_TRY_CONTIGUOUS yes
BUCKET_SIZE 3
EXTENSION 50
KEY 0
CHANGES no
DATA_AREA 0
DATA_FILL 85
DATA_KEY_COMPRESSION yes
DATA_RECORD_COMPRESSION yes
DUPLICATES no
INDEX_AREA 1
INDEX_COMPRESSION no
INDEX_FILL 85
LEVEL1_INDEX_AREA 1
PROLOG 3
SEG0_LENGTH 8
SEG0_POSITION 0
TYPE string
$ eod
$   open/read/write/share/error=error info 'database'
$   boot_record = f$fao ("!128AS", "BOOTTIME" + boottime)
$   write/error=error info boot_record
$ endif
$ gosub discover_disks
$ goto exit
$!
$! error: common failure exit.  Reached through "on warning then goto error",
$! through the /error= and /end= qualifiers on OPEN/READ/WRITE, and through
$! explicit gotos.  Mails a one-line report and falls through to exit.
$error:
$! Capture the status before any other command can replace it.
$ failure_status = $status
$! "node" is only defined after start-up has got as far as reading the node
$! name; do not let an undefined symbol abort the handler itself.
$ failed_node = "(unknown node)"
$ if f$type (node) .nes. "" then failed_node = f$edit (node, "trim")
$ problem = "SHADOW_WATCH terminated unexpectedly on " + failed_node + -
            " with status " + failure_status
$ gosub send_mail
$!
$! exit: close the database if it is still open (OPEN leaves a logical name
$! "info" behind while the file is open), then leave the procedure.
$exit:
$ if f$trnlnm("info") .nes. ""
$ then
$ close info
$ endif
$ exit
$
$!******************************************************************************
$discover_disks:
$! Scan for devices matching _DSA*, and for every mounted one that has no
$! record in the database yet, write a record holding the device name, this
$! node's name, the member count and the comma-separated member list.
$! Uses the open file "info" and the symbols "node" and "say"; returns to
$! the caller when f$device_scan reports no more devices.
$ disk = f$device_scan ("_DSA*")
$ if disk .eqs. ""
$ then
$   goto end_discover_disks
$ endif
$ disk = disk - "_" - ":"
$ if .not. f$getdvi (disk, "mnt")
$ then
$   goto discover_disks
$ endif
$ record = f$fao ("!8AS", disk)
$ member_count = 0
$! Reset per disk: without this a unit that reports no members would reuse
$! the previous disk's list (or reference an undefined symbol on the first).
$ member_list = ""
$ temp = disk
$disk_loop:
$! Follow the member chain until f$getdvi returns a null string.
$ temp = f$getdvi (temp,"shdw_next_mbr_name")
$ if temp .eqs. ""
$ then
$   goto end_disk_loop
$ endif
$ if member_count .eq. 0
$ then
$   member_list = temp
$ else
$   member_list = member_list + "," + temp
$ endif
$ member_count = member_count + 1
$ goto disk_loop
$end_disk_loop:
$ record = record + node + f$string (member_count) + member_list
$ disk = f$fao ("!8AS", disk)
$! A failed keyed read is taken to mean "no record yet"; if the failure had
$! another cause the WRITE below is expected to fail and branch to error.
$ read/key="''disk'"/index=0/error=write_it/nolock info temp
$ goto next_disk
$write_it:
$ say "SHADOW WATCH discovered disk " + disk
$ write/error=error info f$fao ("!128AS", record)
$next_disk:
$ goto discover_disks
$end_discover_disks:
$ return
$!******************************************************************************
$check_disks:
$! Read the database sequentially from the current position.  For each
$! record written by this node:
$!   - delete the record if the device no longer exists;
$!   - skip it if the device is not mounted;
$!   - otherwise rebuild the live member list, mail a report when the live
$!     member count is below the recorded count, and rewrite the record
$!     whenever count or list differ from what was recorded.
$! Uses the open file "info" and the symbols "node", "say" and the record
$! layout offsets; returns to the caller at end of file.
$ read/end=end_check_disks/error=error/nolock info record
$ disk = f$edit (f$extract (disk_offset, disk_length, record), "collapse")
$! Keyed access uses the full 8-character key, as discover_disks does, so a
$! short name such as DSA1 cannot select a longer one such as DSA10.
$ disk_key = f$fao ("!8AS", disk)
$ discover_node = f$extract (discover_node_offset, -
                             discover_node_length, -
                             record)
$ rec_member_cnt = f$integer (f$extract (member_cnt_offset, -
                                         member_cnt_length, -
                                         record))
$ member_list = f$edit (f$extract (member_offset, -
                                   f$length(record)-member_offset, -
                                   record), "collapse")
$ if discover_node .nes. node
$ then
$   goto check_disks
$ endif
$ if .not. f$getdvi (disk, "exists")
$ then
$   read/delete/key="''disk_key'"/index=0 info record
$   goto check_disks
$ endif
$ if .not. f$getdvi (disk, "mnt")
$ then
$   goto check_disks
$ endif
$ say "SHADOW WATCH checking disk " + disk
$ temp = disk
$ member_count = 0
$! Reset per record so a unit reporting no members is not compared against
$! the list built for the previous record.
$ test_list = ""
$check_loop:
$! Follow the member chain until f$getdvi returns a null string.
$ temp = f$getdvi (temp,"shdw_next_mbr_name")
$ if temp .eqs. ""
$ then
$   goto end_check_loop
$ endif
$ if member_count .eq. 0
$ then
$   test_list = temp
$ else
$   test_list = test_list + "," + temp
$ endif
$ member_count = member_count + 1
$ goto check_loop
$end_check_loop:
$ if member_count .lt. rec_member_cnt
$ then
$! Walk the recorded list with its own index; member_count must keep the
$! live count because it is compared and written back further down.
$   old_index = 0
$   lost_member = ""
$! Surround list and candidate with commas so only whole names match.
$   live_members = "," + test_list + ","
$get_lost_member:
$   member = f$element (old_index, ",", member_list)
$   if member .eqs. ","
$   then
$     goto end_get_lost_member
$   endif
$   if f$locate ("," + member + ",", live_members) .eq. f$length (live_members)
$   then
$     if lost_member .eqs. ""
$     then
$       lost_member = member
$     else
$       lost_member = lost_member + "," + member
$     endif
$   endif
$   old_index = old_index + 1
$   goto get_lost_member
$end_get_lost_member:
$   say "SHADOW WATCH lost a member of " + disk
$   problem = "SHADOW_WATCH lost member " + lost_member + " of " + -
              disk + " on " + node
$   gosub send_mail
$ endif
$ if member_count .ne. rec_member_cnt .or. -
     test_list .nes. member_list
$ then
$   say "SHADOW WATCH updating member list for " + disk
$   say "Old: " + member_list
$   say "New: " + test_list
$   retry_count = 0
$update_record:
$! This READ has no /nolock, unlike the sequential READ at the top of the
$! routine; it is followed directly by WRITE/UPDATE.
$   read/key="''disk_key'"/index=0/error=locked info temp
$   record = disk_key + node + f$string (member_count) + test_list
$   write/update/error=error info f$fao ("!128AS", record)
$   goto end_update
$locked:
$! Only this one status is retried (every 2 seconds, giving up once the
$! retry count exceeds 5); presumably it is the RMS record-locked
$! condition - confirm.  Any other status is treated as fatal.
$   if $status .eqs. "%X000182AA"
$   then
$     wait 0:0:2
$     retry_count = retry_count + 1
$     if retry_count .gt. 5
$     then
$       say "Retry count exhausted trying to update record " + disk
$       goto error
$     endif
$     goto update_record
$   else
$     goto error
$   endif
$end_update:
$ endif
$ goto check_disks
$end_check_disks:
$ return
$!******************************************************************************
$send_mail:
$! Mail a report to the SYSTEM username.  The subject line is the current
$! value of the symbol "problem", which the caller sets before the GOSUB;
$! the message file is NL:, so only the subject carries information.
$ mail/subject="''problem'" nl: system
$ return
$
$!++
$!
$! PROCEDURE:
$!
$! SHADOW_WATCH.COM
$!
$! DESCRIPTION:
$!
$! This command procedure is designed to be run via a scheduling system
$! on a regular basis (for example, every 15 or 30 minutes).
$!
$! The command procedure takes a snapshot of all host based shadow
$! sets "soon" after system startup (where soon is defined as "as soon
$! as the scheduling system starts up"). On subsequent runs, the
$! procedure searches the system for shadow sets that have had their
$! membership count fall, and reports these disks to the system username
$! via e-mail.
$!
$! In addition, it discovers members that have been added to shadow sets
$! it already knows about, and it also discovers new shadow sets to check
$! for subsequent runs.
$!
$! If shadow sets are not mounted, it ignores them.
$!
$! AUTHOR:
$!
$! James F. Duff
$! 15-Apr-1998
$!
$! MODIFICATION HISTORY
$!
$! 15-Apr-1998 X01-00 Jim Duff
$! Original version of procedure
$!
$! 16-Jun-1998 X01-01 Jim Duff
$! In subroutine check_disks, ensure the disk exists. If it doesn't,
$! remove it from the datafile.
$!
$! 30-Sep-1998 X01-02 Jim Duff
$! Add /KEY and /INDEX qualifiers to the READ/DELETE statement in
$! subroutine check_disks
$!
$! 21-Oct-1999 X01-03 Jim Duff
$! Make the check_disks routine report the name of the physical disk
$! that dropped from the shadow set.
$!
$! 17-Dec-2007 X01-04 Jim Duff
$! Clean up for publication.
$!
$!--
[quote] "it sometimes works too well i.e. a single drive fails in a set and no one notices it. Then a few months later, a second drive fails and then that's it".
[/quote]
The crux: "and no one notices it".
Who is managing this system? IMHO logfiles need to be checked - as well as the hardware. If no one notices, system management has no meaning at all. Even with a procedure noticing something is wrong, that will be rendered "unnoticed". Outputting mail is one thing - if the procedure runs without error. It will be unnoticed if an error occurs and no mail is sent, or mail fails.
Posted by: SYSMGR at December 19, 2007 12:24 AM
Hi,
one line is wrapped around:
record=f$fao("!8AS",disk)+node+f$string(member_count) + tes
t_list
If you correct this you'll get a nice tool.
thanx
Eberhard
Posted by: Eberhard Heuser at December 19, 2007 2:33 AM
SYSMGR: I agree with your points (and note that the quote was not mine). This is only part of a monitoring system.
As to running without error, the procedure outputs mail on error as well. It also outputs comprehensive information as to what it is noticing. If mail fails, bounces get sent to appropriate people so that action can be taken (in any properly configured and tested e-mail system).
All that being said, this procedure ran at a site with 1000+ shadow sets, and performed as expected.
Eberhard: Thanks! That line has been corrected.
Posted by: Jim Duff at December 19, 2007 8:53 AM
Comments are closed