#!/bin/sh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

# Checkpoint restart command: restarts a checkpointed job with "cpr -r" and
# then waits until the job script process has gone away.
#
# Arguments:
#   $1 - job id; used to name the per-job directory ckpt.<id>, the statefile
#        cpr_<id>.<n> and the log entries
#   $2 - pid of the job script; polled with "ps -p" after a successful restart
#   $3 - checkpoint directory; holds the shared ckpt.log and ckpt.<id>
# Environment:
#   HOME, REQUEST - used to build the per-job log file $HOME/$REQUEST.co<id>
#                   (REQUEST is presumably the job name exported by SGE -
#                   TODO confirm)
# Exit status:
#   0   - restart succeeded and the process $2 is no longer running
#   100 - missing arguments, the per-job checkpoint directory could not be
#         entered, or "cpr -r" returned a non-zero status

# Unset variables (e.g. REQUEST) must not abort the script.
set +u

job_id=$1
job_pid=$2
ckpt_dir=$3
prog=`basename "$0"`

# Without a job id or checkpoint directory every path below would be built
# relative to "/"; refuse to run instead.
if [ -z "$job_id" ] || [ -z "$ckpt_dir" ]; then
   echo "usage: $prog job_id job_pid ckpt_dir" >&2
   exit 100
fi

# Shared log in the checkpoint directory; created world-writable on first use.
ckpt_log=$ckpt_dir/ckpt.log
if [ ! -f "$ckpt_log" ]; then
   touch "$ckpt_log"
   chmod 666 "$ckpt_log"
fi

# create log file

#F=$tmpdir/checkpoint.log
F=$HOME/$REQUEST.co$job_id
touch "$F"

echo -------------------------------------------------------------  >> "$F"
echo "$prog called at `date`"      >> "$F"
echo "called by: `id`"             >> "$F"
echo "with args: $*"               >> "$F"

# Enter the directory holding this job's checkpoint info. The statefile
# name passed to cpr is relative, so continuing from any other directory
# could restart from the wrong statefile; give up if the cd fails.
tmpdir=$ckpt_dir/ckpt.$job_id
mkdir -p "$tmpdir"
if cd "$tmpdir"; then
   :
else
   now=`date +"%D %T"`
   echo "Cannot change to checkpoint directory: $tmpdir" >> "$F"
   echo "$now Job $job_id (pid=$job_pid) restart failed, cannot cd to $tmpdir" >> "$ckpt_log"
   exit 100
fi

now=`date +"%D %T"`
echo "$now Job $job_id \"(pid=$job_pid)\" restarting" >> "$ckpt_log"

# The file "currcpr" names the statefile generation to restart from; any
# content other than "2" (including a missing file) selects generation 1.
currcpr=
if [ -r currcpr ]; then
   currcpr=`cat currcpr`
fi
if [ "$currcpr" != "2" ]; then
   currcpr=1
fi
statefile=cpr_$job_id.$currcpr

echo "Restart command: cpr -r $statefile" >> "$F"
cpr -r "$statefile" >> "$F" 2>&1

# Must directly follow the cpr call so that $? is still cpr's status.
exit_status=$?
echo "Exit status of restart command: $exit_status" >> "$F"

# The restart command is the parent process of the restarted job, and SGE
# is the parent process of the restart command. Any failure of cpr is
# reported to SGE as exit status 100; on success, poll for job completion
# (based on the job script pid). The ps listing itself is discarded, only
# its exit status is of interest.
if [ "$exit_status" -ne 0 ]; then
   exit_status=100
else
   while ps -p "$job_pid" > /dev/null
   do
      sleep 2
   done
fi

# Disabled: forwarding a kill signal (status > 128) by killing ourselves.
# This doesn't work under Irix 6.2, since the variable $$ is not
# correctly set
#if [ $exit_status -gt 128 ]; then
#   signal=`expr $exit_status - 128`
#   echo Killing ourself: kill -$signal $$        >> $F 2>&1
#   /usr/bin/kill -$signal $pid                   >> $F 2>&1
#fi

now=`date +"%D %T"`
echo "$now Job $job_id (pid=$job_pid) exiting, status=$exit_status" >> "$ckpt_log"

echo "Exiting with exit status: $exit_status"      >> "$F"
exit $exit_status
