#!/usr/bin/ksh
#
# gbo_uc_rmd_mirror.ksh - Mirror the Ground Magnetometer (GMAG) RMD files
#                         that UCalgary collects from the GBO sites.
#
# (lphilpott Oct-2011) Calgary has changed the way in which data is made
# available as of Oct 2011. Previously we retrieved data using rsync. Now it
# is available on http, so we retrieve it using wget.
#
# This script:
#   1. downloads the files for the requested year/month with wget,
#   2. writes file information for every downloaded file to a database
#      (via gbo_uc_rmd_mirror.php),
#   3. writes a list of the sites and dates that have been retrieved to
#      $GMAGMIRROR/GBO_RMDdirs<pid> for later processing to CDF.
#
# When rsync was used, download statistics were also written to a database.
# This is not currently enabled (see the commented-out section at the end).
#
# Usage: gbo_uc_rmd_mirror.ksh [year] [month] [day] [duration]
#
# Arguments:
#   $1 = year
#   $2 = month
#   $3 = day       (currently ignored)
#   $4 = duration  (currently ignored)
#
# Variables used but not set here (LOGFILE, WLOGFILE, WEBSITE_NAME,
# UCB_MIRROR_HOME, GMAGMIRROR, SOC_EMAIL) are expected to come from the
# environment or from the two config files sourced below.
#
# Creation Date:
#
#   22 march 2006 TimQuinn
#

#
## Set the environment variables
#
if [[ -z $THMSOC ]]
then
    THMSOC=/disks/socware/thmsoc_dp_current
    export THMSOC
fi

. "${THMSOC}/src/config/soc_it_to_me.config"
. "${THMSOC}/src/config/gbo_uc_rmd_mirror.config"

# Take input arguments and build mirror directories
#
year=$1
month=$2
#day=$3
args_dir_path=${year}/${month}/
#if [[ -z $month && -z $day ]]
#then
#    args_dir_path=${args_dir_path%%//}
#elif [[ -z $day ]]
#then
#    args_dir_path=${args_dir_path%/}
#fi

mirror_start_dir="$UCB_MIRROR_HOME/${args_dir_path}"

# wget is pointed at $UCB_MIRROR_HOME with -P, so a failure here is reported
# but is not fatal.
mkdir -p "$mirror_start_dir" ||
    echo "$$: WARNING: could not create $mirror_start_dir" >> "$LOGFILE"
cd "$mirror_start_dir" ||
    echo "$$: WARNING: could not cd to $mirror_start_dir" >> "$LOGFILE"

#
echo "$$:" >> "$LOGFILE"
echo "$$:----------- Starting gbo_uc_rmd_mirror at $(date) ---------" >> "$LOGFILE"

# start_time/end_time are only consumed by the disabled statistics section.
start_time=$(date -u '+%Y-%m-%d %T')

filenum=0
WGET_LOGFILE=${WLOGFILE}_pid$$_${filenum}

echo "Starting wget at directory level ${args_dir_path}" >> "$LOGFILE"
echo "Starting wget at directory level ${args_dir_path}" >> "$WGET_LOGFILE"
echo "Wget Path Requested: ${WEBSITE_NAME}/${args_dir_path}" >> "$WGET_LOGFILE"

remoteLocation=${WEBSITE_NAME}/${args_dir_path}
wgetTime=$(date '+%Y-%m-%d %T')

# -nH and --cut-dirs=4 gets rid of the web host name and top four directory
#     levels, ie. the part:
#     http://aurora.phys.ucalgary.ca/data/themis/fluxgate/stream0/
# -np stops it from ascending through "directories"
# -a  appends to the log file; -o would truncate it and throw away the header
#     lines written just above. The log name contains the PID, so appending
#     never mixes in output from another run.
/usr/bin/wget -r -nH --cut-dirs=4 -N -a "$WGET_LOGFILE" -P "$UCB_MIRROR_HOME" \
    -A 'RMD, LOD, HKP' -np "${remoteLocation}"
wgetcode=$?

if (( wgetcode != 0 ))
then
    echo "WARNING WGET PROBLEM: Return code = $wgetcode" >> "$LOGFILE"
    # $SOC_EMAIL is left unquoted on purpose: it may hold several addresses.
    printf "%s\n" "script: $0" \
        "Warning: UCalgary Download Problem:code $wgetcode: path ${args_dir_path}" |
        mailx -s "UCalgary Download Problem" $SOC_EMAIL
    msg="Warning: UCalgary Download Problem:code $wgetcode: path ${args_dir_path}"
    thmsoc_dblog.php "$(basename "$0")" 1 "$msg"   ### added 2012-06-25 - bsadeghi
fi

end_time=$(date -u '+%Y-%m-%d %T')

## Process log file created by wget run.
## Check wget results by seeing if any files were
## recovered or updated

# Scratch file holding "<path> <size>" for every file wget saved. Prefer an
# unpredictable name from mktemp; fall back to the historical PID-based name
# if mktemp is not available. The EXIT trap removes it on every exit path.
saved_list=$(mktemp /tmp/wget_gbo_uc_rmd.XXXXXX 2>/dev/null) ||
    saved_list=/tmp/wget_gbo_uc_rmd$$
trap 'rm -f "$saved_list"' EXIT

# Look for all lines in the logfile that say "saved" but that don't say
# "index" or "robot"; print the filename (field 6) and size (field 8).
awk '/saved/ && !/index|robot/ {print $6 " " $8}' "$WGET_LOGFILE" > "$saved_list"

previous=NaN
while read -r path size
do
    # Ignore malformed records that carry no path at all.
    [[ -n $path ]] || continue

    fileName=$(basename "$path")
    fileName=${fileName%\'}          # drop the closing quote wget puts around the name

    # Parse date information from filename. The first "_"-separated field is
    # read as SITEYYMMDD, the third as the hour and the fourth starts with the
    # minutes. Loop-local names are used so that the script-level $year/$month
    # (the requested directory) are not overwritten.
    f_year=$(printf '%s\n' "$fileName" | awk -F_ '{print "20"substr($1,5,2)}')
    f_month=$(printf '%s\n' "$fileName" | awk -F_ '{print substr($1,7,2)}')
    f_day=$(printf '%s\n' "$fileName" | awk -F_ '{print substr($1,9,2)}')
    hr=$(printf '%s\n' "$fileName" | awk -F_ '{print $3}')
    mn=$(printf '%s\n' "$fileName" | awk -F_ '{print substr($4,1,2)}')
    dataTime="${f_year}-${f_month}-${f_day} ${hr}:${mn}:00"

    # Verify file exists
    mirrored_file="${UCB_MIRROR_HOME}/${f_year}/${f_month}/${f_day}/${fileName}"
    if [[ ! -a "$mirrored_file" ]]
    then
        echo "$$:gbo_uc_rmd_mirror_log:" >> "$LOGFILE"
        echo "$$:gbo_uc_rmd_mirror_log: ${mirrored_file} not found. " >> "$LOGFILE"
        echo "$$:gbo_uc_rmd_mirror_log: continuing..................." >> "$LOGFILE"
        printf "%s\n" "script: $0" \
            "Warning: ${mirrored_file} not found." \
            "Check $LOGFILE" |
            mailx -s "GBO Calgary Download Discrepancy" $SOC_EMAIL
        msg="Warning: ${mirrored_file} not found."
        thmsoc_dblog.php "$(basename "$0")" 1 "$msg"   ### added 2012-06-25 - bsadeghi
    fi

    # For some reason the size format seems to be in the form [size/size],
    # unlike for MACCS. Remove the brackets and keep only the first value.
    fileSize=${size%\]}
    fileSize=${fileSize#\[}
    fileSize=${fileSize%%/*}

    # Update database with stats.
    # There is no relevant process code for wget (there was for rsync), but
    # it is necessary to pass something to the database.
    processCode='--------'
    processTime=$wgetTime
    schName=$(printf '%s\n' "$fileName" | awk -F_ '{print substr($1,1,4)}')
    mysql_table_name=gbo_${schName}_rmdfiles
    echo "$$: Updating MySQL database table $mysql_table_name......." >> "$LOGFILE"
    echo "$$: gbo_uc_rmd_mirror.php $mysql_table_name $dataTime $fileName $processTime $processCode $fileSize" >> "$LOGFILE"
    # $dataTime and $processTime are deliberately left unquoted: each one
    # word-splits into separate date and time arguments, which is how this
    # helper has always been invoked from here.
    gbo_uc_rmd_mirror.php "$mysql_table_name" \
        $dataTime "$fileName" $processTime "$processCode" "$fileSize"
    echo "$$: ...........update complete. " >> "$LOGFILE"

    # Create a file used for CDF processing.
    # Only want one listing for each SITEYYMMDD, not one for each hour.
    sub_filename=$(printf '%s\n' "$fileName" | awk '{print substr($1,1,10)}')
    if [[ "$sub_filename" != "$previous" ]]
    then
        echo "$sub_filename" >> "$GMAGMIRROR/GBO_RMDdirs$$"
        previous=${sub_filename}
    fi
done < "$saved_list"

#
## Cleanup
#
# The scratch file is removed by the EXIT trap installed above.

# Process wget statistics into database
# NB: this is not running, as it is difficult to extract the relevant
# information from the wget output: total size and transfer rate are not
# necessarily given in fixed units - ie. could be KB sometimes and M some
# other times.
# It is uncertain whether the previous rsync statistics were used for anything.
#rsync_dir=${year}/${month}/
# num_files=$(grep "Downloaded" $WGET_LOGFILE | awk '{print $2}')
# tot_size=$(grep "Downloaded" $WGET_LOGFILE | awk '{print $4}')
# xfer_rate=$(grep "Downloaded" $WGET_LOGFILE | awk '{print substr($8,2)}')
# echo "$$: Updating MySQL database table gbo_ucla_rmd_rsync_stats......." >> $LOGFILE
# echo "$$: gbo_uc_rmd_mirror_stats.php \
#     $rsync_dir $start_time $end_time $num_files $tot_size $xfer_rate" >> $LOGFILE
# gbo_uc_rmd_mirror_stats.php \
#     $rsync_dir $start_time $end_time $num_files $tot_size $xfer_rate

echo "$$:" >> "$LOGFILE"
echo "$$:----------- Ending gbo_uc_rmd_mirror_wget_test at $(date) ---------" >> "$LOGFILE"
exit 0