distributed-system-analysis
diff --git a/agent/util-scripts/pbench-move-results b/agent/util-scripts/pbench-move-results
Lines changed: 67 additions & 88 deletions
diff --git a/server/pbench/bin/pbench-base.sh b/server/pbench/bin/pbench-base.sh
Lines changed: 25 additions & 24 deletions
diff --git a/server/pbench/bin/pbench-dispatch b/server/pbench/bin/pbench-dispatch
Lines changed: 5 additions & 5 deletions
@@ -8,13 +8,15 @@ pbench_bin="`cd ${script_path}/..; /bin/pwd`"
 # source the base script
 . "$pbench_bin"/base
 
+controller=$hostname
+
 function usage() {
     printf "usage:\n"
     printf "$script_name [--prefix=<path>] [--xz-single-threaded] [--show-server]\n"
 }
 
 # Process options and arguments
-opts=$(getopt -q -o p:xS --longoptions "prefix:,xz-single-threaded,show-server" -n "getopt.sh" -- "$@");
+opts=$(getopt -q -o u:p:xS --longoptions "user:,prefix:,xz-single-threaded,show-server" -n "getopt.sh" -- "$@");
 if [ $? -ne 0 ]; then
     printf "\n"
     printf "%s\n" $*
@@ -24,11 +26,20 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
+user=${PBENCH_USER}
+prefix=
 xz_single_threaded=
 show_server=
 eval set -- "$opts";
 while true; do
     case "$1" in
+	-u|--user)
+	    shift;
+            if [ -n "$1" ]; then
+                user="$1"
+                shift;
+            fi
+            ;;
         -p|--prefix)
             shift;
             if [ -n "$1" ]; then
@@ -43,7 +54,7 @@ while true; do
 	-S|--show-server)
 	    shift;
 	    show_server=1
-	    ;;	
+	    ;;
         --)
             shift;
             break;
@@ -61,6 +72,7 @@ if [ ! -f "$pbench_bin/id_rsa" ]; then
 	exit 1
 fi
 
+# ask the server where to send the tarballs
 results_webserver=$(getconf.py webserver results)
 if [ -z "$results_webserver" ]; then
     error_log "ERROR: No web server host configured from which we can fetch the FQDN of the host to which we copy/move results"
@@ -123,7 +135,6 @@ if [ -z "$results_path_prefix" ]; then
     debug_log "expected the results_host_info to have the form: <results_user>@<results_host(FQDN)>:<results_path_prefix>"
     exit 1
 fi
-results_full_path="$results_path_prefix/$hostname"
 
 if [[ ! -z "$show_server" ]] ;then
     echo ${results_repo}
@@ -137,17 +148,20 @@ if [ $? -ne 0 ]; then
     debug_log "the following ssh command failed: \"ssh -q -i $pbench_bin/id_rsa $ssh_opts $results_repo exit\""
     exit 1
 fi
-ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo "mkdir -p $results_full_path"
-if [ $? -ne 0 ]; then
-    error_log "ERROR: unable to create remote results path, $results_repo:$results_full_path"
-    exit 1
-fi
 
 let runs_copied=0
 let failures=0
 
-trap "rm -f $pbench_tmp/prefix.*" EXIT INT QUIT
+tmp=${pbench_tmp}/${script_name}.$$
+trap "rm -rf $tmp" EXIT INT QUIT
 
+mkdir -p $tmp/$controller
+sts=$?
+if [ $sts -ne 0 ] ;then
+    error_log "Failed: \"mkdir -p $tmp/$controller\", status $sts"
+    exit 1
+fi
+# We can now start copying tarballs to the server
 
 # Move into pbench run collection directory
 pushd $pbench_run >/dev/null
@@ -172,120 +186,85 @@ for dir in `/bin/ls -ort -d */ | awk '{print $8}' | grep -v "^tools-" | grep -v
         /bin/cp pbench.log $pbench_run_name/
     fi
 
+    # if a user was given (-u, or $PBENCH_USER by default), store it in metadata.log
+    if [ ! -z "$user" ] ;then
+	mdlog=${pbench_run_name}/metadata.log
+	printf '%s\n' "$user" | pbench-add-metalog-option "${mdlog}" run user
+    fi
+
+    # if -p was specified, store the specified prefix in metadata.log
+    if [ ! -z "$prefix" ] ;then
+	mdlog=${pbench_run_name}/metadata.log
+	printf '%s\n' "$prefix" | pbench-add-metalog-option "${mdlog}" run prefix
+    fi
+
     results_size=`du -sm $pbench_run_name | awk '{print $1}'`
     debug_log "preparing to copy $results_size MB of data from $pbench_run/$pbench_run_name"
 
-    tarball="$pbench_run_name.tar.xz"
+    # The tarball and its MD5 file (named <tarball>.md5.check) are written
+    # into the temp directory $tmp/$controller, which is then copied with
+    # scp -r $tmp/$controller <remote>:<results_path_prefix>; that will create
+    # the $controller subdirectory on the remote (if necessary) OR fail.
+
+    # If it does not fail, the MD5 sum is checked on the remote and <tarball>.md5.check
+    # is renamed to <tarball>.md5: the signal that the agent has finished with this tarball.
+
+    tarball="$tmp/$controller/$pbench_run_name.tar.xz"
     if [[ ${xz_single_threaded} != "1" ]] ;then
 	echo "tar --create --force-local \"$pbench_run_name\" | xz -T0 > \"$tarball\" "
 	tar --create --force-local "$pbench_run_name" | xz -T0 > "$tarball"
     else
 	echo "tar --create --xz --force-local --file=\"$tarball\" \"$pbench_run_name\" "
-	tar --create --xz --force-local --file="$tarball" "$pbench_run_name" 
+	tar --create --xz --force-local --file="$tarball" "$pbench_run_name"
     fi
-    
+
     if [ $? -ne 0 ]; then
         error_log "ERROR: tar failed for $pbench_run/$pbench_run_name, skipping"
         rm -f "$tarball"
         let failures=failures+1
         continue
     fi
-    md5sum "$tarball" > "$tarball.md5"
+
+    tarballmd5="$tarball.md5.check"
+    # we need to calculate the md5 sum in the temp directory
+    # in order to get the filename right.
+    pushd $(dirname $tarball) > /dev/null
+    md5sum "$(basename $tarball)" > "$tarballmd5"
     if [ $? -ne 0 ]; then
         error_log "ERROR: md5sum failed for $tarball, skipping"
-        rm -f $tarball $tarball.md5
+        rm -f "$tarball" "$tarballmd5"
         let failures=failures+1
+	popd >/dev/null
         continue
     fi
-
-    # Perform the actual copy
-    # if a prefix is provided, copy it to the other side - maybe this should be part of the tarball?
-    prefixfile=""
-    if [ ! -z "$prefix" ] ;then
-        prefixfile=$pbench_tmp/prefix.$pbench_run_name
-        echo "$prefix" > $prefixfile
-    fi
-
-    ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo "mkdir -p $results_full_path"
-
-    # check if there is a name collision and resolve it
-    typeset -i i=1
-    rtarball=$tarball
-    rprefixfile=$prefixfile
-    while (( 1 )) ; do
-        ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo "test -f $results_full_path/$rtarball"
-        if [ $? -eq 0 ] ;then
-            # collision - warn the first time around
-            if [ $i -eq 1 ] ;then
-                log "WARNING: name collision - $results_repo:$results_full_path/$rtarball exists"
-            fi
-        else
-            # collision resolution found
-            if [ "$tarball" != "$rtarball" ] ;then
-                mv $tarball $rtarball
-                mv $tarball.md5 $rtarball.md5
-                tarball=$rtarball
-                if [ ! -z "$prefixfile" ]; then
-                    mv $prefixfile $rprefixfile
-                    prefixfile=$rprefixfile
-                fi
-            fi
-            break
-        fi
-        rtarball="DUPLICATE__NAME.$i.$pbench_run_name.tar.xz"
-        if [ ! -z "$prefixfile" ] ;then
-            rprefixfile="$prefixfile.$i"
-        fi
-        i=$i+1
-    done
-
-    # FIXME: don't assume final path contains /incoming in it
-    if [[ $i -eq 1 ]]; then
-        if [[ ! -z "$prefix" ]]; then
-            debug_log "copying $tarball to http://$results_webserver/results/$hostname/$prefix/..."
-        else
-            debug_log "copying $tarball to http://$results_webserver/results/$hostname/..."
-        fi
-    else
-        debug_log "archiving $tarball to $results_webserver, but not being made available via the web"
-    fi
+    popd >/dev/null
 
     # finally do the copy
-    scp $scp_opts -i $pbench_bin/id_rsa $ssh_opts ./$tarball ./$tarball.md5 $prefixfile $results_repo:$results_full_path
+    scp -r $scp_opts -i $pbench_bin/id_rsa $ssh_opts $tmp/$controller $results_repo:$results_path_prefix
     if [ $? -ne 0 ]; then
-        error_log "ERROR: unable to copy results tarball, $tarball, to $results_repo:$results_full_path"
-        rm -f $tarball $tarball.md5
+        error_log "ERROR: unable to copy results tarball, $tarball, to $results_repo:$results_path_prefix"
+        rm -f $tarball $tarballmd5
         let failures=failures+1
         continue
     fi
 
-    # clean up the prefix file (if present)
-    if [ ! -z "$prefixfile" -a -f "$prefixfile" ] ;then
-        rm -f $prefixfile
-    fi
-
     # Verify the bits copied are good
-    ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo "cd $results_full_path; md5sum --check $pbench_run_name.tar.xz.md5"
+    md5name=$(basename "$tarball").md5    # final name; the copy was made as ${md5name}.check
+    ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo "cd $results_path_prefix/$controller && md5sum --check ${md5name}.check && mv ${md5name}.check ${md5name}"
     chk_res=$?
-    rm -f $tarball $tarball.md5
     if [ $chk_res -ne 0 ]; then
         error_log "ERROR: remote copy failed, remote tarball MD5 does not match original"
-        rm -f $tarball $tarball.md5
+        rm -f $tarball $tarballmd5
         let failures=failures+1
         continue
-    else
-        if [ "$script_name" == "pbench-move-results" ]; then
-            rm -rf $pbench_run_name
-        else
-            touch $pbench_run_name.copied
-        fi
     fi
+    rm -f $tarball $tarballmd5
 
-    # set the state of the result appropriately so that it will be processed
-    # by the server scripts.
-    ssh -i $pbench_bin/id_rsa $ssh_opts $results_repo \
-	/opt/pbench-server/bin/pbench-server-set-result-state $results_full_path $tarball
-
+    if [ "$script_name" == "pbench-move-results" ]; then
+        rm -rf $pbench_run_name
+    else
+        touch $pbench_run_name.copied
+    fi
     let runs_copied=runs_copied+1
 done
 
 
@@ -42,11 +42,10 @@ else
 fi
 
 ARCHIVE=${TOP}/archive/fs-version-001
-INOTIFY_STATE_DIR=${ARCHIVE}/inotify_state
 INCOMING=${TOP}/public_html/incoming
 # this is where the symlink forest is going to go
 RESULTS=${TOP}/public_html/results
-
+USERS=${TOP}/public_html/users
 
 if [[ -z "$_PBENCH_SERVER_TEST" ]]; then
     function timestamp {
@@ -125,27 +124,29 @@ function log_finish {
     exec 4>&-    # Close error file
 }
 
-# The inotify script runs the server scripts like dispatch 
-# and unpack asynchronously (more will be added in future), 
-# results in multiple instances of those scripts running in 
-# parallel. If every instance tries to write in the same file 
-# then it will be chaos and make things difficult to debug. 
-# In that case, this function will acquire a lock on the main
-# log file and allow every instance to append the log saved in 
-# the /tmp directory (with different PID) to the main log file.
-
-function log_append {
-    #log_append $TMP/$(basename $0).$$ $LOGSDIR/$(basename $0)
-    TMP_DIR=$1
-    LOG_DIR=$2
-    mkdir -p $LOG_DIR
-    if [[ $? -ne 0 || ! -d "$LOG_DIR" ]]; then
-        doexit "Unable to find/create logging directory, $LOG_DIR"
+# Function used by the shims to quarantine problematic tarballs:
+#     quarantine <dest-dir> <file>...
+# creates <dest-dir> if needed and moves the files into it.  It must be
+# called within a log_init/log_finish context (errors are written to fd 4).
+# Errors are fatal: exit 101 if mkdir fails, exit 102 if mv fails.
+function quarantine () {
+    local dest="$1" sts
+    shift
+    local -a files=("$@")    # an array keeps names with whitespace intact
+
+    mkdir -p -- "$dest"
+    sts=$?
+    if [ $sts -ne 0 ] ;then
+        echo "$TS: quarantine $dest ${files[*]}: \"mkdir -p $dest\" failed with status $sts" >&4
+        log_finish
+        exit 101
+    fi
+    mv -- "${files[@]}" "$dest"
+    sts=$?
+    if [ $sts -ne 0 ] ;then
+        # log error
+        echo "$TS: quarantine $dest ${files[*]}: \"mv ${files[*]} $dest\" failed with status $sts" >&4
+        log_finish
+        exit 102
     fi
-
-    log_file=$LOG_DIR/$(basename $0).log
-    error_file=$LOG_DIR/$(basename $0).error
-
-    flock -n $log_file cat $TMP_DIR/$(basename $0).log >> $log_file
-    flock -n $error_file cat $TMP_DIR/$(basename $0).error >> $error_file
 }
@@ -16,7 +16,7 @@
 #               this tarball again; but if there are errors, we may
 #               keep it in TODO and try again, if the error is recoverable.
 #               Any errors are reported for possible action by an admin.
-#               
+#
 
 # assumptions:
 # - this script runs as a cron job
@@ -97,7 +97,7 @@ else
 
         link=$(readlink -e $result)
         if [ ! -f "$link" ] ;then
-            echo "$TS: $link does not exist" >&4
+            echo "$TS: $result->$link does not exist" >&4
             nerrs=$nerrs+1
             continue
         fi
@@ -119,15 +119,15 @@ else
             nerrs=$nerrs+1
             continue
         fi
-        
+
         mkdir -p $TMP/$PROG/$hostname
         status=$?
         if [[ $status -ne 0 ]] ;then
             echo "$TS: mkdir -p $TMP/$PROG/$hostname failed: code $status" >&4
             nerrs=$nerrs+1
             continue
         fi
-        
+
         # XXXX - for now, if it's a duplicate name, just punt and avoid producing the error - the full
         # solution will involve renaming the unpacked directory appropriately.
         if [ ${resultname%%.*} == "DUPLICATE__NAME" ] ;then
@@ -148,7 +148,7 @@ else
 
         # move any prefix file to the .prefix subdir
         basedir=$(dirname $link)
-        prefixfile=$basedir/prefix.$resultname
+        prefixfile=$basedir/$resultname.prefix
         if [ -f $prefixfile ] ;then
             mkdir -p $basedir/.prefix
             mv $prefixfile $basedir/.prefix