#!/usr/bin/ksh

# gbo_uc_rmd_mirror.ksh - Mirror the Ground Magnetometer (GMAG)
# RMD files that UCalgary collects from the GBO sites
# (originally via rsync, now via wget over HTTP; see note below).
#
# (lphilpott Oct-2011) Calgary changed the way in which data is made available as of Oct 2011.
# Previously we retrieved data using rsync; it is now served over HTTP and we retrieve it with wget.
# This script downloads the files, writes file information to a database, and writes a list of sites
# and dates that have been retrieved to a file for later processing to CDF.
# When rsync was used, download statistics were also written to a database; this is not currently enabled.
#
#	Usage: gbo_uc_rmd_mirror.ksh [year] [month]
#
#	Arguments:
#						$1 = year
#						$2 = month
#
#	(The former [day] and [duration] arguments are not currently used; the day handling is commented out below.)
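#
#	Example invocation (the year/month values are illustrative only):
#
#						gbo_uc_rmd_mirror.ksh 2011 10
#
#	This mirrors ${WEBSITE_NAME}/2011/10/ into ${UCB_MIRROR_HOME}/2011/10/
#	(both variables are set in the sourced config files).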
#
#									
# Creation Date:
#
#								22 March 2006 TimQuinn
#
#	

#
## Set the environment variables
#
	if [[ -z $THMSOC ]]
	then
		THMSOC=/disks/socware/thmsoc_dp_current
		export THMSOC
	fi
	. ${THMSOC}/src/config/soc_it_to_me.config
	. ${THMSOC}/src/config/gbo_uc_rmd_mirror.config
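# The sourced config files are expected to define (based on how they are used below):
# UCB_MIRROR_HOME, WEBSITE_NAME, LOGFILE, WLOGFILE, GMAGMIRROR and SOC_EMAIL.
# A minimal sketch with placeholder values only (the real values live in the configs):
#   UCB_MIRROR_HOME=/path/to/local/mirror                                      # local mirror root
#   WEBSITE_NAME=http://aurora.phys.ucalgary.ca/data/themis/fluxgate/stream0   # remote data root (per the wget comment below)
#   LOGFILE=/path/to/gbo_uc_rmd_mirror.log                                     # main script log
#   WLOGFILE=/path/to/wget_gbo_uc_rmd.log                                      # prefix for per-run wget logs
#   GMAGMIRROR=/path/to/gmag/work/dir                                          # where GBO_RMDdirs$$ is written
#   SOC_EMAIL=soc@example.com                                                  # warning e-mail recipient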

# Take input arguments and build mirror directories
#
	year=$1
	month=$2
	#day=$3
	args_dir_path=${year}/${month}/
	#if [[ -z $month && -z $day ]]
	#then
	#	args_dir_path=${args_dir_path%%//}
	#elif [[ -z $day ]]
	#then
	#	args_dir_path=${args_dir_path%/}
	#fi

	mirror_start_dir="$UCB_MIRROR_HOME/${args_dir_path}"
	mkdir -p "$mirror_start_dir"
	cd "$mirror_start_dir" || exit 1


#
	echo "$$:" >> $LOGFILE
	echo "$$:----------- Starting gbo_uc_rmd_mirror at $(date) ---------" >> $LOGFILE

	start_time=$(date -u '+%Y-%m-%d %T')

	filenum=0

	WGET_LOGFILE=${WLOGFILE}_pid$$_${filenum}
	
			echo "Starting wget at directory level ${args_dir_path}" >> $LOGFILE
			echo "Starting wget at directory level ${args_dir_path}" >> $WGET_LOGFILE
		    echo "Wget Path Requested: ${WEBSITE_NAME}/${args_dir_path}" >> $WGET_LOGFILE
		
			remoteLocation=${WEBSITE_NAME}/${args_dir_path}
				wgetTime=$(date '+%Y-%m-%d %T')
				# -nH and --cut-dirs=4 gets rid of the web host name and top four directory levels
				# ie. the part: http://aurora.phys.ucalgary.ca/data/themis/fluxgate/stream0/ 
				# -np stops it from ascending through "directories"
				/usr/bin/wget -r -nH --cut-dirs=4 -N -o $WGET_LOGFILE -P $UCB_MIRROR_HOME -A 'RMD, LOD, HKP' -np ${remoteLocation}
				
			wgetcode=$?
			if (( $wgetcode != 0 ))
			then
				echo "WARNING WGET PROBLEM: Return code = $wgetcode" >> $LOGFILE
				printf "%s\n" "script: $0" \
				"Warning: UCalgary Download Problem:code $wgetcode: path ${args_dir_path}" |
				mailx -s "UCalgary Download Problem" $SOC_EMAIL
				msg="Warning: UCalgary Download Problem:code $wgetcode: path ${args_dir_path}"
				thmsoc_dblog.php $(basename $0) 1 "$msg"		### added 2012-06-25 - bsadeghi
			fi
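	# Sketch of the directory mapping the wget call above produces, assuming WEBSITE_NAME
	# points at the stream0 URL mentioned in the comments (illustrative paths only):
	#   remote: http://aurora.phys.ucalgary.ca/data/themis/fluxgate/stream0/2011/10/01/<file>.RMD
	#   local:  ${UCB_MIRROR_HOME}/2011/10/01/<file>.RMD
	# -nH drops the host name and --cut-dirs=4 drops the data/themis/fluxgate/stream0 components.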
				

	end_time=$(date -u '+%Y-%m-%d %T')

## Process log file created by wget run.
## Check wget results by seeing if any files were
## recovered or updated
		# look for all lines in the logfile that say "saved" but that don't say "index" or "robot"
		# this prints the filename and size to a temporary file
		
		grep saved $WGET_LOGFILE | grep -v 'index\|robot' | awk '{print $6 " " $8}' > /tmp/wget_gbo_uc_rmd$$
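		# A "saved" line in the wget log is assumed to look roughly like:
		#   2011-10-01 12:34:56 (1.2 MB/s) - `/local/mirror/path/<file>.RMD' saved [4096/4096]
		# so field 6 is the quoted local path and field 8 is the [bytes/bytes] size field.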
	previous=NaN
	while read line
	do
		path=$(echo $line | awk '{print $1}')
		size=$(echo $line | awk '{print $2}')

		fileName=$(basename $path)
		fileName=${fileName%*\'}	# strip the trailing quote wget puts around the path

		# Parse date information from filename
		year=$(echo $fileName | awk -F_ '{print "20"substr($1,5,2)}')
		month=$(echo $fileName | awk -F_ '{print substr($1,7,2)}')
		day=$(echo $fileName |awk -F_ '{print substr($1,9,2)}')
		hr=$(echo $fileName | awk -F_ '{print $3}')
		mn=$(echo $fileName | awk -F_ '{print substr($4,1,2)}')
		dataTime="${year}-${month}-${day} ${hr}:${mn}:00" 
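		# The parsing above assumes filenames laid out roughly as ssssYYMMDD_<label>_HH_MMxxx,
		# i.e. a 4-character site code followed by a 2-digit year, month and day, with the hour
		# and minute in the 3rd and 4th underscore-separated fields. For example, a hypothetical
		# gill111001_xxx_07_30xx.RMD would give dataTime "2011-10-01 07:30:00".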

		# Verify file exists
		if [[ ! -a ${UCB_MIRROR_HOME}/${year}/${month}/${day}/${fileName} ]]
		then
		    echo "$$:gbo_uc_rmd_mirror_log:" >> $LOGFILE
		    echo "$$:gbo_uc_rmd_mirror_log: ${UCB_MIRROR_HOME}/${year}/${month}/${day}/${fileName} not found. " >> $LOGFILE
		    echo "$$:gbo_uc_rmd_mirror_log: continuing..................." >> $LOGFILE

			printf "%s\n" "script: $0" \
			"Warning: ${UCB_MIRROR_HOME}/${year}/${month}/${day}/${fileName} not found." \
			"Check $LOGFILE" |
			 mailx -s "GBO Calgary Download Discrepancy" $SOC_EMAIL
			msg="Warning: ${UCB_MIRROR_HOME}/${year}/${month}/${day}/${fileName} not found."
			thmsoc_dblog.php $(basename $0) 1 "$msg"		### added 2012-06-25 - bsadeghi
		fi

		fileSize=${size%\]}
		fileSize=${fileSize#\[}
		fileSize=$(echo $fileSize | awk -F/ '{print $1}')
		# the size field from the wget log is in the form [size/size] (unlike for MACCS),
		# so strip the brackets and take only the first value
	
		# Update database with stats.
		# there is no relevant process code for wget (there was for rsync), but it is necessary to pass something to the database
		processCode='--------'
		processTime=$wgetTime

		schName=$(echo $fileName |awk -F_ '{print substr($1,1,4)}')

		mysql_table_name=gbo_${schName}_rmdfiles
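		# e.g. the hypothetical gill111001_... file above would update table gbo_gill_rmdfiles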
		echo "$$:  Updating MySQL database table $mysql_table_name......." >> $LOGFILE
		echo "$$: gbo_uc_rmd_mirror.php $mysql_table_name $dataTime $fileName \
							$processTime $processCode $fileSize" >> $LOGFILE
 		gbo_uc_rmd_mirror.php $mysql_table_name \
					$dataTime $fileName $processTime $processCode $fileSize
		echo "$$:  ...........update complete. " >> $LOGFILE

		# Create a file used for CDF processing
		# only want one listing for each SITEYYMMDD, not one for each hour
		sub_filename=$(echo $fileName | awk '{print substr($1,1,10)}')
		if [[ $sub_filename != $previous ]]
		then
			echo $sub_filename >> $GMAGMIRROR/GBO_RMDdirs$$
			previous=${sub_filename}
		fi
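		# e.g. hourly files gill111001_xxx_07_... and gill111001_xxx_08_... yield a single
		# "gill111001" entry in $GMAGMIRROR/GBO_RMDdirs$$ (note that only consecutive duplicates
		# are suppressed, which assumes files for a given site/day appear together in the wget log)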

	done < /tmp/wget_gbo_uc_rmd$$
#
## Cleanup
#
	rm -f /tmp/wget_gbo_uc_rmd$$

		# Process wget statistics into database
		# NB: this is not running because it is difficult to extract the relevant information from
		# the wget output: the total size and transfer rate are not reported in fixed units
		# (e.g. sometimes KB, sometimes MB). It is also uncertain whether the previous rsync
		# statistics were ever used for anything.
		#rsync_dir=${year}/${month}/
		#	num_files=$(grep "Downloaded" $WGET_LOGFILE | awk '{print $2}')
		#	tot_size=$(grep "Downloaded" $WGET_LOGFILE | awk '{print $4}')
		#	xfer_rate=$(grep "Downloaded" $WGET_LOGFILE | awk '{print substr($8,2)}')
    #  echo "$$:  Updating MySQL database table gbo_ucla_rmd_rsync_stats......." >> $LOGFILE
		#	echo "$$: gbo_uc_rmd_mirror_stats.php \
		#		$rsync_dir $start_time $end_time $num_files $tot_size $xfer_rate" >> $LOGFILE			
    # gbo_uc_rmd_mirror_stats.php \
    #					$rsync_dir $start_time $end_time $num_files $tot_size $xfer_rate



	echo "$$:" >> $LOGFILE
	echo "$$:----------- Ending  gbo_uc_rmd_mirror_wget_test at $(date) ---------" >> $LOGFILE


exit 0