;+
; The documentation is not yet correct.
; NAME:
;    file_http_copy
;
; PURPOSE:
;    Use the IDL SOCKET procedure to get files from http servers
;
; EXPLANATION:
;     FILE_HTTP_COPY can access http servers - even from behind a firewall -
;     and perform simple downloads.
;     Currently requires IDL V5.4 or later on Unix or Windows, V5.6 on
;     Macintosh.
;
; CALLING SEQUENCE:
;      FILE_HTTP_COPY,url,serverdir=serverdir,localdir=localdir,pathname=pathname
;
; INPUTS:
;      URL - scalar string giving a fully qualified url of the form
;          'http://server.eso.org/path/file.html'.
; KEYWORDS:
;
;
; Examples:
;   ;Download most recent version of this file to current directory:
;   FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/ssl_general/misc/file_http_copy.pro'
;
; OPTIONAL INPUT KEYWORD PARAMETERS:
;       PATHNAME = pathname   ; pathname is the filename to be created.
;                If the directory does not exist then it will be created.
;                If PATHNAME does not exist then the original filename is used
;                and placed in the current directory.
;
;       /SILENT - If set, informational and error messages are suppressed.
;                (NOTE: the keyword is accepted, but the procedure body does
;                not currently act on it.)
;
; RESTRICTIONS:
;
;     PROXY: If you are behind a firewall and have to access the net through a
;         Web proxy,  set the environment variable 'http_proxy' to point to
;         your proxy server and port, e.g.
;         from a C shell:  setenv http_proxy http://web-proxy.mpia-hd.mpg.de:3128
;         or within IDL:   setenv,'http_proxy=http://web-proxy.mpia-hd.mpg.de:3128'
;
;               The URL *MUST* begin with "http://".
;
; PROCEDURE:
;     Open a socket to the webserver and download the header.
;
; EXAMPLE:
;      IDL> FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/themisdata/thg/l1/asi/whit/2006/thg_l1_asf_whit_2006010103_v01.cdf'
;      IDL> PRINTDAT, file_info('thg_l1_asf_whit_2006010103_v01.cdf')
;      or
;
;
; MINIMUM IDL VERSION:
;     V5.4  (uses SOCKET)
; MODIFICATION HISTORY:
;   Original version:  WEBGET()
;     Written by M. Feldt, Heidelberg, Oct 2001 <mfeldt@mpia.de>
;     Use /swap_if_little_endian keyword to SOCKET  W. Landsman August 2002
;     Less restrictive search on Content-Type   W. Landsman   April 2003
;     Modified to work with FIRST image server-  A. Barth, Nov 2006
;   FILE_HTTP_COPY:   New version created by D Larson:  March 2007.
;     Very heavily modified:
;
; $LastChangedBy: pcruce $
; $LastChangedDate: 2007-07-23 14:18:44 -0700 (Mon, 23 Jul 2007) $
; $LastChangedRevision: 1221 $
; $URL: svn+ssh://thmsvn@ambrosia.ssl.berkeley.edu/repos/ssl_general/tags/tdas_2_02/misc/file_http_copy.pro $
;-

;+
;FUNCTION extract_html_links(filename,count)
;PURPOSE:  returns relative links within an html file
;INPUT:  filename: (string) valid filename
;OUTPUT:  count:  number of links found
;-
function extract_html_links,filename,count   ; Links with '*' or '?' are removed.
   ; Purpose:
   ;    Scan an html file for '<a href="...">' anchors and return the
   ;    relative links they point to.
   ; Input:
   ;    filename - (string) name of the html file to read.
   ; Output:
   ;    count    - number of links returned (0 when none were found or the
   ;               file could not be read).
   ; Returns:
   ;    String array of links, or the scalar '' when nothing usable was found.
   ;    Empty links, links containing '?' or '*', and links that start with
   ;    '/' are dropped.
   count=0
   on_ioerror, badfile
   openr,lun,filename,/get_lun
   s=''
   links = ''                         ; dummy first element; removed by the strlen filter below
   tag = '<a href="'
   ntag = strlen(tag)
   while not eof(lun) do begin
      readf,lun,s
      slc = strlowcase(s)             ; the tag itself is matched case-insensitively
      p0 = strpos(slc,tag)
      while p0 ge 0 do begin          ; a single line may hold several anchors
         p1 = strpos(s,'">',p0)
         if p1 lt p0+ntag then break  ; no closing '">' for this anchor: give up on this line
         links = [links,strmid(s,p0+ntag,p1-p0-ntag)]
         p0 = strpos(slc,tag,p1+2)    ; continue searching after the closing '">'
      endwhile
   endwhile
   free_lun,lun
   bad = strlen(links) eq 0
   bad = (strpos(links,'?') ge 0) or bad
   bad = (strpos(links,'*') ge 0) or bad
   bad = (strpos(links,'/') eq 0) or bad   ; remove absolute links (start with '/')
   w = where(bad eq 0,count)
   return,count gt 0 ? links[w] : ''
   badfile:
   on_ioerror, null                   ; do not re-enter this handler from the cleanup below
   if n_elements(lun) ne 0 then free_lun,lun   ; release the unit if it was allocated
   dprint,'Bad file: '+filename
   return,''
end



PRO file_http_copy_MimeType,  Header, Class=class, Type=type, Length=length, date=date, last_modified=last_modified,found=found
;;
;; MIME type recognition: pick selected fields out of an HTTP response header.
;;
;; Input:
;;   Header        - string array, one element per header line.
;; Output keywords:
;;   Class, Type   - the two parts of the Content-Type value split at '/'.
;;                   Default to 'text' and 'simple' when not present.
;;   Length        - Content-Length as a long; 0 when the field is missing.
;;   date          - text following 'Date:' (leading blank retained); '' if missing.
;;   last_modified - text following 'Last-Modified:' (leading blank retained); '' if missing.
;;   found         - true when the first header line contains '200 OK'.
;; Length, date, last_modified and found are only computed when the caller
;; supplies a variable for them.
;
  Class = 'text'
  Type = 'simple'               ; in case no information found...
  def = strupcase(strmid(header,0,13))
  g = where(def EQ 'CONTENT-TYPE:', Ng)
  if Ng GT 0 then begin
       ; Trim instead of assuming exactly one blank after the colon.
       ClassAndType = strtrim(strmid(Header[g[0]], 13), 2)
       parts = strsplit(ClassAndType, '/', /extract)
       if strlen(parts[0]) gt 0 then Class = parts[0]
       ; A value without '/' keeps the default Type instead of raising a
       ; subscript-out-of-range error.
       if n_elements(parts) gt 1 then Type = parts[1]
  ENDIF
  if arg_present(found) then begin
       pos = strpos(strupcase(header[0]),'200 OK')
       found = pos ge 0
  endif
  if arg_present(length) then begin
       def = strupcase(strmid(header,0,15))
       g = where(def EQ 'CONTENT-LENGTH:', Ng)
       if Ng GT 0 then  Length = long(strmid(Header[g[0]], 15)) $
       else length = 0
  endif
  if arg_present(date) then begin
       tok = 'DATE:'
       ltok = strlen(tok)
       def = strupcase(strmid(header,0,ltok))
       g = where(def EQ tok, Ng)
       if Ng GT 0 then    date = strmid(Header[g[0]], ltok)  $
       else date = ''
  endif
  if arg_present(last_modified) then begin
       tok = 'LAST-MODIFIED:'
       ltok = strlen(tok)
       def = strupcase(strmid(header,0,ltok))
       g = where(def EQ tok, Ng)
       if Ng GT 0 then  last_modified = strmid(Header[g[0]], ltok)  $
       else last_modified = ''
  endif
END




PRO file_http_copy,  url,  SILENT=silent, $
     PATHNAME=pathname, verbose=verbose, $
     serverdir=serverdir,  $       ; input:  (string) URL of source files: ie:  'http://themis.ssl.berkeley.edu/data/themis/'      ;trailing '/' is required
     localdir=localdir, $          ; input:  (string) destination directory i.e.:  'e:/data/themis/'        ;trailing '/' is required
     localnames=localname, $       ; output:  Downloaded filenames are returned in this variable
     last_version=last_version, $
     min_age_limit=min_age_limit, $
     progobj=progobj, $   ; This keyword is experimental - please don't count on it
     error = error
  ;;
  ;; Outline:
  ;;   1. If PATHNAME contains glob characters (* ? [ ]), fetch the remote
  ;;      directory index, match its links against the pattern and call this
  ;;      procedure recursively for every match.
  ;;   2. Otherwise open a socket (directly, or through the proxy named by the
  ;;      'http_proxy' environment variable), send a GET request, read the
  ;;      response header and, when the remote file is found and the local
  ;;      copy is missing or not newer, write the body to LOCALDIR+PATHNAME.
  ;;   ERROR receives the SOCKET error status, or the error message text when
  ;;   the download itself fails.
  ;;
  ;; sockets supported in unix & windows since V5.4, Macintosh since V5.6
  tstart = systime(1)
  dprint,dlevel=4,'Start; $Id: file_http_copy.pro 1221 2007-07-23 21:18:44Z pcruce $'
  localname=''

  if n_elements(verbose) ne 0 then dprint,setdebug=verbose,getdebug=last_dbg

  if not keyword_set(url) then url = serverdir+pathname
  if n_elements(pathname) eq 0 then pathname = file_basename(url)
  if not keyword_set(localdir) then localdir = ''
  dprint,dlevel=4,/phelp,serverdir
  dprint,dlevel=4,/phelp,localdir
  dprint,dlevel=4,/phelp,pathname
  dprint,dlevel=4,/phelp,url
  indexfilename =  '.remote-index.html'

  ; strpos returns -1 when a character is absent; uint() turns that into
  ; 65535 so that min() yields the position of the first glob character, or
  ; a large number when there is none.
  globpos = min( uint( [strpos(pathname,'*'),strpos(pathname,'?'),strpos(pathname,'['),strpos(pathname,']')] ) )
  if globpos le 1000 then begin   ; Look for globbed  ([*?]) filenames
     dprint,dlevel=4,'Warning! Using Globbing!'
     slash='/'
     ; sub_pathname: directory part in front of the first globbed path element
     slashpos1 = strpos(pathname,slash,globpos,/reverse_search)
     sub_pathname = strmid(pathname,0,slashpos1+1)
     dprint,dlevel=5,/phelp,sub_pathname
     ; Fetch the index of that directory and pull the links out of it.
     file_http_copy,serverdir=serverdir,localdir=localdir,pathname=sub_pathname,localname=indexfilepath,min_age_limit=min_age_limit
     links = extract_html_links(indexfilepath)
     dprint,dlevel=5,/phelp,links
     ; sup_pathname: the pattern up to and including the globbed element;
     ; end_pathname: whatever follows it.
     slashpos2 = strpos(pathname,slash,globpos)
     if slashpos2 eq -1 then slashpos2 = strlen(pathname)  ; special case for non-directories  (files)
     sup_pathname = strmid(pathname,0,slashpos2+1)
     end_pathname = strmid(pathname,slashpos2+1)
     w = where(strmatch(sub_pathname+links,sup_pathname),nlinks)
     if nlinks gt 0 then begin
         rec_pathnames = sub_pathname + links[w] + end_pathname
         dprint,dlevel=5,/phelp,sup_pathname
         dprint,dlevel=5,/phelp,end_pathname
         dprint,dlevel=5,/phelp,rec_pathnames
         ; With /last_version only the last matching link is retrieved.
         if keyword_set(last_version) then i0 = nlinks-1  else i0=0L
         for i=i0,nlinks-1 do begin
             dprint,dlevel=4,'Retrieve link#'+strtrim(i,2),' ', rec_pathnames[i]
             file_http_copy,serverdir=serverdir,localdir=localdir,pathname=rec_pathnames[i]  $
                       , min_age_limit=min_age_limit, last_version=last_version, localnames=lns
             dprint,dlevel=5,/phelp,lns
             if keyword_set(lns) then localname = keyword_set(localname) ?  [localname,lns] : lns
             dprint,dlevel=5,/phelp,localname
         endfor
     endif else begin
         dprint,dlevel=3,'No files found matching: '+sup_pathname
     endelse
     goto, final
  endif             ;  End of globbed filenames

  localname = localdir + pathname
  ; A pathname ending in '/' is a directory: store its listing in the index file.
  if strmid(localname,0,1,/reverse_offset) eq '/' then localname = localname + indexfilename
  lcl = file_info(localname)

  if tstart-lcl.mtime lt (keyword_set(min_age_limit) ? min_age_limit : 0) then begin
      dprint,dlevel=1,'Found recent file: "'+localname+'" (assumed valid)'
      goto, final   ;not working yet
  endif


  ;;
  ;; open the connection and request the file
  ;;
  read_timeout = 30

  ; Split the url ('http://' + host + path) into host name and server-relative
  ; path.  Both are used further down (request, log messages) whether or not
  ; a proxy is involved, so they are computed before the proxy test.
  hostpath = strmid(url, 7)
  slash1 = StrPos(hostpath, '/')
  if slash1 ge 0 then begin
      Server = StrMid(hostpath, 0, slash1)
      purl = strmid(hostpath, slash1)
  endif else begin                       ; url without any path component
      Server = hostpath
      purl = '/'
  endelse

  Proxy = getenv('http_proxy')
  IF Proxy NE '' THEN BEGIN
      ;;
      ;; sort out proxy name: expected form is 'http://host:port'
      ;;
      LastColon = StrPos(Proxy, ':', /Reverse_Search)
      connect_port = long(StrMid(Proxy, LastColon+1, StrLen(Proxy)))   ; long: ports above 32767 overflow fix()
      connect_host = StrMid(Proxy, 7, LastColon-7)
      request = url                      ; a proxy is handed the absolute url
  ENDIF ELSE BEGIN
      ;;
      ;; same thing easier without proxy
      ;;
      connect_host = Server
      connect_port = 80
      request = purl
  ENDELSE

  dprint,dlevel=4,'Opening server: ',connect_host
  socket, unit, connect_host, connect_port, /get_lun,/swap_if_little_endian,error=error,read_timeout=read_timeout
  if error ne 0 then begin
      If(n_elements(unit) Gt 0) Then free_lun, unit;jmm, 19-jun-2007 for cases where unit is undefined
      dprint,dlevel=0,!error_state.msg
      goto, final
  endif
  ; Request line, then one header field per line, then an empty line.
  printf, unit, 'GET '+request+' HTTP/1.0'
  printf, unit, 'Host: ' + Server
  printf, unit, 'User-Agent: IDL ' + !VERSION.RELEASE + ' on ' + !VERSION.OS + '/' + !VERSION.ARCH
  printf, unit, ''

  LinesRead = 0
  text = 'xxx'
  ;;
  ;; now read the header (terminated by an empty line)
  ;;
On_IOERROR, done

  Header = strarr(256)
  WHILE  text NE '' do begin
      readf, unit, text
      Header[LinesRead] = text
      LinesRead = LinesRead+1
      IF LinesRead MOD 256 EQ 0 THEN $
        Header=[Header, StrArr(256)]
  ENDWHILE
DONE: On_IOERROR, NULL
  ;;
  if LinesRead EQ 0 then begin
     free_lun, unit
     dprint,dlevel=0,!error_state.msg
     goto, final
  endif

  Header = Header[0:LinesRead-1]
  file_http_copy_MimeType, Header, class=Class,  type=Type, length=Length, date=date, last=last, found=found  ; analyze the header

  ; Without a Last-Modified field the remote file is treated as modified "now".
  modtime = keyword_set(last) ? str2time(last, informat = 'DMYhms') : systime(1)
  clock_offset = str2time(date,informat = 'DMYhms') - tstart
  dprint,dlevel=6,'Header= ',transpose(header)
  dprint,dlevel=5,/phelp,class
  dprint,dlevel=5,/phelp,type
  dprint,dlevel=5,/phelp,length
  dprint,dlevel=5,/phelp,date
  dprint,dlevel=5,/phelp,last
  ;;
  if abs(clock_offset) gt 30 then dprint,dlevel=1,'Warning! Your Clock is off by:',clock_offset,' Seconds'

  dprint,dlevel=4,'localname="'+localname+'"'

  if found then begin
      tdiff = (modtime - lcl.mtime) / 24./3600.  ; days old
      MB = 2.^20
      if lcl.exists   then  begin
          dprint,dlevel=5,'tdiff=',tdiff,' days'
          if tdiff gt 0  then    dprint,dlevel=4, format="('Updating ',f0.1,' day old file: ',a  )", tdiff, localname
          if  lcl.size ne length then begin
              dprint,dlevel=1,length/mb,lcl.size/mb, file_basename(localname), format='("Warning! Different file sizes: Remote=",f0.3," MB, Local=",f0.3," MB file: ",a)'
              dprint,dlevel=1,purl,format='("Warning! Local file may be corrupted, if so please delete it and try again. File Name: DATADIR",a)'

          endif
      endif else begin
          dprint,dlevel=3,format="('Found new (',f0.3,' MB) file: ""',a,'""')",length/mb,url
      endelse
      if lcl.exists eq 0 or tdiff ge 0 then begin    ; download file
          dirname = file_dirname(localname)
          if file_test(dirname,/dir) eq 0 then begin
              dprint,dlevel=3,'Creating new directory: "'+dirname+'"'
              file_mkdir,dirname
          endif
          On_IOERROR, file_error2
          openw, wunit, localname, /get_lun
          ts = systime(1)
          t0 = systime(1)
          if length eq 0 and  class eq 'text' then begin         ; Text file download
            dprint,dlevel=3,'Downloading as a text file.'
            while  eof(unit) EQ 0 do begin
              readf, unit, text
              printf, wunit, text
            endwhile
          endif else begin                      ; Non-text (binary) files
            maxb = 2l^20   ; 1 Megabyte default buffer size
            nb=0l          ; bytes transferred so far
            b=0l           ; bytes transferred since the last progress message
            percent = 100. ; value reported after the loop if there is nothing to transfer
            while nb lt length do begin
              buffsize = maxb  <  (length-nb)
              aaa = bytarr(buffsize,/nozero)
              readu, unit, aaa
              writeu, wunit, aaa
              nb += buffsize
              t1 = systime(1)
              dt = t1-t0
              b += buffsize
              percent = 100.*float(nb)/length
              if (dt gt 5.) and (nb lt length) then begin   ; Wait 5 seconds between updates.
                 rate = b/mb/dt                             ; This will only display if the filesize (LENGTH) is greater than MAXB
                 eta = (length-nb)/mb/rate +t1 - tstart
                 messstr = string(format='("  ",f5.1," %  (",f0.1,"/",f0.1," secs)  @ ",f0.2," MB/s")', percent, t1-tstart,eta, rate ,/print)
                 t0 = t1
                 b =0l
                 dprint,dlevel=1,messstr   &  wait,.01
                 if obj_valid(progobj)  then begin
                     progobj->update,percent,text=messstr
                     ; /ioerror routes the cancel through the file_error2 handler below
                     if progobj->checkcancel() then message,'Download cancelled by user',/ioerror
                 endif
              endif
            endwhile
            t1 = systime(1)
            dt = t1 - tstart
            messstr = string(/print,format = "('Downloaded ',f0.3,' MBytes in ',f0.1,' secs @ ',f0.2,' MB/s  File: ""', a,'""' )",nb/mb,dt,nb/mb/dt,localname )
            dprint,dlevel=2,messstr
            if obj_valid(progobj)  then begin
              progobj->update,percent,text=messstr
            endif
          endelse
          free_lun, wunit
          ; The block below is only ever entered through the file_error2 label.
          if 0 then begin
              file_error2:
              On_IOERROR, NULL          ; a failure during cleanup must not re-enter this handler
              beep
              dprint,dlevel=0,'Error downloading file: "',url,'"'
              error = !error_state.msg
              dprint,dlevel=0,error
              dprint,dlevel=0,'Deleting: "' + lcl.name +'"'
              if obj_valid(progobj)  then begin
                 progobj->update,0.,text=error
              endif
              if keyword_set(wunit) then free_lun, wunit
              ; The file may not exist at all if openw itself failed.
              file_delete,lcl.name,/allow_nonexistent
          endif
      endif else dprint,'Found local  file: "' + localname + '" (No download needed)'
  endif else dprint,'Remote file not found! "'+ url + '"'
  On_IOERROR, NULL
  free_lun, unit
  dprint,dlevel=4,'Closing server: ',server
  ;;
final:
if n_elements(verbose) ne 0 then dprint,setdebug=last_dbg           ; Reset previous debug level.
return

END