;+
; NAME:
;   file_http_copy
;
; PURPOSE:
;   Use the IDL SOCKET procedure to get files from http servers
;
; EXPLANATION:
;   FILE_HTTP_COPY can access http servers - even from behind a firewall -
;   and perform simple downloads.
;   Requires IDL V5.4 or later on Unix or Windows, V5.6 on Macintosh
;   (parts of this documentation are inherited from WEBGET and may lag
;   behind the code).
;
; CALLING SEQUENCE:
;   FILE_HTTP_COPY,url,serverdir=serverdir,localdir=localdir,pathname=pathname
;
; INPUTS:
;   URL - scalar string giving a fully qualified url of the form
;         'http://server.eso.org/path/file.html'.
;         If not set, it is built as SERVERDIR+PATHNAME.
;
; OPTIONAL INPUT KEYWORD PARAMETERS:
;   PATHNAME = pathname of the file relative to SERVERDIR / LOCALDIR.
;         The local file is written to LOCALDIR+PATHNAME.
;         If the directory does not exist then it will be created.
;         If PATHNAME is not supplied then the original filename
;         (file_basename(url)) is used.
;         PATHNAME may contain the glob characters * ? [ ] ; in that case the
;         directory index is downloaded and every matching link is retrieved
;         by a recursive call.
;   SERVERDIR = (string) URL of source files, i.e.:
;         'http://themis.ssl.berkeley.edu/data/themis/' (trailing '/' is required)
;   LOCALDIR = (string) destination directory, i.e.: 'e:/data/themis/'
;         (trailing '/' is required).  Defaults to ''.
;   LAST_VERSION - if set, only the last matching link of a globbed
;         PATHNAME is retrieved.
;   MIN_AGE_LIMIT - if the local file was modified less than this many
;         seconds ago it is assumed valid and the server is not contacted.
;   VERBOSE - passed to DPRINT,SETDEBUG= for the duration of the call.
;   PROGOBJ - experimental progress object (uses its UPDATE and CHECKCANCEL
;         methods) - please don't count on it.
;   /SILENT - accepted for compatibility; currently not used by the code.
;
; OPTIONAL OUTPUT KEYWORD PARAMETERS:
;   LOCALNAMES - local filename(s) handled by this call.
;   ERROR - error status returned by SOCKET, or the error message if the
;         download itself failed.
;
; RESTRICTIONS:
;
;   PROXY: If you are behind a firewall and have to access the net through a
;         Web proxy, set the environment variable 'http_proxy' to point to
;         your proxy server and port, e.g.
;         'setenv http_proxy=http://web-proxy.mpia-hd.mpg.de:3128'
;
;   The URL *MUST* begin with "http://".
;
; PROCEDURE:
;   Open a socket to the webserver and download the header.
;   If the header reports '200 OK' and the local file is missing or not
;   newer than the remote one, the body is copied to the local file.
;
; EXAMPLES:
;   ;Download most recent version of this file to current directory:
;   FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/ssl_general/misc/file_http_copy.pro'
;
;   IDL> FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/themisdata/thg/l1/asi/whit/2006/thg_l1_asf_whit_2006010103_v01.cdf'
;   IDL> PRINTDAT, file_info('thg_l1_asf_whit_2006010103_v01.cdf')
;
; MINIMUM IDL VERSION:
;   V5.4  (uses SOCKET)
;
; MODIFICATION HISTORY:
;   Original version:  WEBGET()
;   Written by M. Feldt, Heidelberg, Oct 2001 <mfeldt@mpia.de>
;   Use /swap_if_little_endian keyword to SOCKET  W. Landsman August 2002
;   Less restrictive search on Content-Type   W. Landsman   April 2003
;   Modified to work with FIRST image server-  A. Barth, Nov 2006
;   FILE_HTTP_COPY:   New version created by D Larson:  March 2007.
;   Very heavily modified:
;
; $LastChangedBy: pcruce $
; $LastChangedDate: 2007-07-23 14:18:44 -0700 (Mon, 23 Jul 2007) $
; $LastChangedRevision: 1221 $
; $URL: svn+ssh://thmsvn@ambrosia.ssl.berkeley.edu/repos/ssl_general/tags/tdas_2_02/misc/file_http_copy.pro $
;-


;+
;FUNCTION extract_html_links(filename,count)
;PURPOSE:  returns relative links within an html file
;INPUT:  filename: (string) valid filename
;OUTPUT:  count:  number of links found
;RETURNS: string array of links, or '' if none were found or the file
;         could not be read.
;NOTES:  Only the first '<a href="..."' on each line is examined.
;        Links with '*' or '?' and absolute links (leading '/') are removed.
;-
function extract_html_links,filename,count

  count = 0
  opened = 0                       ; becomes 1 once LUN really refers to an open file
  on_ioerror, badfile
  openr,lun,filename,/get_lun
  opened = 1

  s = ''
  links = ''                       ; dummy first element, filtered out below
  while not eof(lun) do begin
    readf,lun,s
    p0 = strpos(strlowcase(s),'<a href="')
    if p0 ge 0 then begin
      p1 = strpos(s,'">',p0)
      if p1 ge p0+9 then begin     ; 9 = strlen('<a href="')
        link = strmid(s,p0+9,p1-p0-9)
        links = [links,link]
      endif
    endif
  endwhile
  on_ioerror, null
  free_lun,lun

  bad = strlen(links) eq 0
  bad = (strpos(links,'?') ge 0) or bad
  bad = (strpos(links,'*') ge 0) or bad
  bad = (strpos(links,'/') eq 0) or bad   ; remove absolute links (start with '/')
  w = where(bad eq 0,count)
  return, (count gt 0) ? links[w] : ''

  badfile:
  ; Release the unit if the failure happened after the file was opened.
  if opened then free_lun,lun
  dprint,'Bad file: '+filename
  return,''

end


;+
;PROCEDURE file_http_copy_MimeType
;PURPOSE:  Extract selected fields from an HTTP response header.
;INPUT:  Header: string array, one header line per element
;        (element 0 is the status line).
;OUTPUT KEYWORDS:
;   CLASS, TYPE:    the two parts of 'Content-Type: class/type'
;                   (defaults 'text' and 'simple' when absent).
;   FOUND:          1 if the status line contains '200 OK', else 0.
;   LENGTH:         value of 'Content-Length:' as a long, 0 if absent.
;   DATE:           text following 'Date:', '' if absent.
;   LAST_MODIFIED:  text following 'Last-Modified:', '' if absent.
;   FOUND, LENGTH, DATE and LAST_MODIFIED are only computed when the caller
;   supplies a variable for them.
;-
PRO file_http_copy_MimeType, Header, Class=class, Type=type, Length=length, date=date, last_modified=last_modified,found=found
  ;;
  ;; MIME type recognition
  ;
  Class = 'text'
  Type = 'simple'                  ; in case no information found...
  def = strupcase(strmid(header,0,13))
  g = where(def EQ 'CONTENT-TYPE:', Ng)
  if Ng GT 0 then begin
    line = Header[g[0]]
    ; Take everything after the colon and trim blanks, so both
    ; 'Content-Type: text/html' and 'Content-Type:text/html' work.
    ClassAndType = strtrim(strmid(line, 13, strlen(line)), 2)
    parts = strsplit(ClassAndType, '/', /extract)
    ; Keep the defaults for any part the server did not send.
    if strlen(parts[0]) gt 0 then Class = parts[0]
    if n_elements(parts) gt 1 then Type = parts[1]
  ENDIF

  if arg_present(found) then begin
    pos = strpos(strupcase(header[0]),'200 OK')
    found = pos ge 0
  endif

  if arg_present(length) then begin
    def = strupcase(strmid(header,0,15))
    g = where(def EQ 'CONTENT-LENGTH:', Ng)
    if Ng GT 0 then Length = long(strmid(Header[g[0]], 15, strlen(Header[g[0]])-1)) $
    else length = 0
  endif

  if arg_present(date) then begin
    tok = 'DATE:'
    ltok = strlen(tok)
    def = strupcase(strmid(header,0,ltok))
    g = where(def EQ tok, Ng)
    if Ng GT 0 then date = strmid(Header[g[0]], ltok, strlen(Header[g[0]])-1) $
    else date = ''
  endif

  if arg_present(last_modified) then begin
    tok = 'LAST-MODIFIED:'
    ltok = strlen(tok)
    def = strupcase(strmid(header,0,ltok))
    g = where(def EQ tok, Ng)
    if Ng GT 0 then last_modified = strmid(Header[g[0]], ltok, strlen(Header[g[0]])-1) $
    else last_modified = ''
  endif

END


PRO file_http_copy, url, SILENT=silent, $
    PATHNAME=pathname, verbose=verbose, $
    serverdir=serverdir, $         ; input: (string) URL of source files: ie: 'http://themis.ssl.berkeley.edu/data/themis/'  ;trailing '/' is required
    localdir=localdir, $           ; input: (string) destination directory i.e.: 'e:/data/themis/'  ;trailing '/' is required
    localnames=localname, $        ; output: Downloaded filenames are returned in this variable
    last_version=last_version, $
    min_age_limit=min_age_limit, $
    progobj=progobj, $             ; This keyword is experimental - please don't count on it
    error = error
  ;;
  ;;
  ;; sockets supported in unix & windows since V5.4, Macintosh since V5.6
  tstart = systime(1)
  dprint,dlevel=4,'Start; $Id: file_http_copy.pro 1221 2007-07-23 21:18:44Z pcruce $'
  localname=''
  if n_elements(verbose) ne 0 then dprint,setdebug=verbose,getdebug=last_dbg

  if not keyword_set(url) then url = serverdir+pathname
  if n_elements(pathname) eq 0 then pathname = file_basename(url)
  if not keyword_set(localdir) then localdir = ''

  dprint,dlevel=4,/phelp,serverdir
  dprint,dlevel=4,/phelp,localdir
  dprint,dlevel=4,/phelp,pathname
  dprint,dlevel=4,/phelp,url

  indexfilename = '.remote-index.html'

  ; strpos returns -1 when a character is absent; uint(-1) is 65535, so MIN
  ; yields the position of the first glob character, or 65535 if there is none.
  globpos = min( uint( [strpos(pathname,'*'),strpos(pathname,'?'),strpos(pathname,'['),strpos(pathname,']')] ) )
  if globpos le 1000 then begin    ; Look for globbed  ([*?]) filenames
    dprint,dlevel=4,'Warning!  Using Globbing!'
    slash='/'
    ; sub_pathname: directory part that precedes the first globbed path element
    slashpos1 = strpos(pathname,slash,globpos,/reverse_search)
    sub_pathname = strmid(pathname,0,slashpos1+1)
    dprint,dlevel=5,/phelp,sub_pathname
    ; Fetch the index of that directory and collect the links it contains.
    file_http_copy,serverdir=serverdir,localdir=localdir,pathname=sub_pathname,localnames=indexfilepath,min_age_limit=min_age_limit
    links = extract_html_links(indexfilepath)
    dprint,dlevel=5,/phelp,links
    ; sup_pathname: pattern up to and including the globbed element;
    ; end_pathname: whatever follows it (possibly containing more globs).
    slashpos2 = strpos(pathname,slash,globpos)
    if slashpos2 eq -1 then slashpos2 = strlen(pathname)   ; special case for non-directories (files)
    sup_pathname = strmid(pathname,0,slashpos2+1)
    end_pathname = strmid(pathname,slashpos2+1)
    w = where(strmatch(sub_pathname+links,sup_pathname),nlinks)
    if nlinks gt 0 then begin
      rec_pathnames = sub_pathname + links[w] + end_pathname
      dprint,dlevel=5,/phelp,sup_pathname
      dprint,dlevel=5,/phelp,end_pathname
      dprint,dlevel=5,/phelp,rec_pathnames
      if keyword_set(last_version) then i0 = nlinks-1 else i0=0L
      for i=i0,nlinks-1 do begin
        dprint,dlevel=4,'Retrieve link#'+strtrim(i,2),' ', rec_pathnames[i]
        file_http_copy,serverdir=serverdir,localdir=localdir,pathname=rec_pathnames[i] $
          , min_age_limit=min_age_limit, last_version=last_version, localnames=lns
        dprint,dlevel=5,/phelp,lns
        if keyword_set(lns) then localname = keyword_set(localname) ? [localname,lns] : lns
        dprint,dlevel=5,/phelp,localname
      endfor
    endif else begin
      dprint,dlevel=3,'No files found matching: '+sup_pathname
    endelse
    goto, final
  endif                            ; End of globbed filenames

  localname = localdir + pathname
  ; A trailing '/' means a directory listing; store it under the index filename.
  if strmid(localname,0,1,/reverse_offset) eq '/' then localname = localname + indexfilename

  lcl = file_info(localname)

  if tstart-lcl.mtime lt (keyword_set(min_age_limit) ? min_age_limit : 0) then begin
    dprint,dlevel=1,'Found recent file: "'+localname+'" (assumed valid)'
    goto, final                    ;not working yet
  endif

  ;;
  ;; open the connection and request the file
  ;;
  read_timeout = 30

  ; Split the url into server name and path.  This is done before looking at
  ; the proxy because SERVER and PURL are also used in messages further down.
  ; The 7 skips the leading 'http://'.
  slash1 = StrPos(strmid(url, 7, StrLen(url)), '/')
  if slash1 lt 0 then begin
    ; url has no path component (e.g. 'http://host'): request the root.
    Server = StrMid(url, 7, StrLen(url))
    purl = '/'
  endif else begin
    Server = StrMid(url, 7, slash1 )
    purl = strmid(url,slash1+7, StrLen(url))
  endelse

  Proxy = getenv('http_proxy')
  IF Proxy NE '' THEN BEGIN
    ;;
    ;; sort out proxy name
    ;;
    LastColon = StrPos(Proxy, ':', /Reverse_Search)
    ; long() rather than fix(): port numbers above 32767 overflow a 16-bit integer.
    ProxyPort = long(StrMid(Proxy, LastColon+1, StrLen(Proxy)))
    ProxyServer = StrMid(Proxy, 7, LastColon-7)
    connect_host = ProxyServer
    connect_port = ProxyPort
    request = url                  ; a proxy is given the full url
  ENDIF ELSE BEGIN
    connect_host = Server
    connect_port = 80
    request = purl
  ENDELSE

  dprint,dlevel=4,'Opening server: ',connect_host
  socket, unit, connect_host, connect_port, /get_lun,/swap_if_little_endian,error=error,read_timeout=read_timeout
  if error ne 0 then begin
    If(n_elements(unit) Gt 0) Then free_lun, unit   ;jmm, 19-jun-2007 for cases where unit is undefined
    dprint,dlevel=0,!error_state.msg
    goto, final
  endif

  ;; send the 'GET' command: request line, headers, then an empty line
  printf, unit, 'GET '+request + ' HTTP/1.0'
  printf, unit, 'User-Agent: IDL ' + !VERSION.RELEASE + ' on ' + !VERSION.OS + '/' + !VERSION.ARCH
  printf, unit, 'Host: ' + Server  ; needed by servers hosting several sites
  printf, unit, ''

  LinesRead = 0
  text = 'xxx'
  ;;
  ;; now read the header (it ends at the first empty line)
  ;;
  On_IOERROR, done
  Header = strarr(256)
  WHILE text NE '' do begin
    readf, unit, text
    Header[LinesRead] = text
    LinesRead = LinesRead+1
    IF LinesRead MOD 256 EQ 0 THEN $
      Header=[Header, StrArr(256)]
  ENDWHILE
  DONE: On_IOERROR, NULL
  ;;
  if LinesRead EQ 0 then begin
    free_lun, unit
    dprint,dlevel=0,!error_state.msg
    goto, final
  endif

  Header = Header[0:LinesRead-1]

  ; analyze the header  (LAST is an abbreviation of the LAST_MODIFIED keyword)
  file_http_copy_MimeType, Header, class=Class, type=Type, length=Length, date=date, last=last, found=found

  modtime = keyword_set(last) ? str2time(last, informat = 'DMYhms') : systime(1)
  ; Only compare clocks when the server actually sent a Date header.
  clock_offset = keyword_set(date) ? (str2time(date,informat = 'DMYhms') - tstart) : 0d

  dprint,dlevel=6,'Header= ',transpose(header)
  dprint,dlevel=5,/phelp,class
  dprint,dlevel=5,/phelp,type
  dprint,dlevel=5,/phelp,length
  dprint,dlevel=5,/phelp,date
  dprint,dlevel=5,/phelp,last
  ;;
  if abs(clock_offset) gt 30 then dprint,dlevel=1,'Warning!  Your Clock is off by:',clock_offset,' Seconds'

  dprint,dlevel=4,'localname="'+localname+'"'

  if found then begin
    tdiff = (modtime - lcl.mtime) / 24./3600.   ; days old
    MB = 2.^20
    if lcl.exists then begin
      dprint,dlevel=5,'tdiff=',tdiff,' days'
      if tdiff gt 0 then dprint,dlevel=4, format="('Updating ',f0.1,' day old file: ',a )", tdiff, localname
      if lcl.size ne length then begin
        dprint,dlevel=1,length/mb,lcl.size/mb, file_basename(localname), format='("Warning! Different file sizes: Remote=",f0.3," MB, Local=",f0.3," MB file: ",a)'
        dprint,dlevel=1,purl,format='("Warning! Local file may be corrupted, if so please delete it and try again. File Name: DATADIR",a)'
      endif
    endif else begin
      dprint,dlevel=3,format="('Found new (',f0.3,' MB) file: ""',a,'""')",length/mb,url
    endelse

    if (lcl.exists eq 0) or (tdiff ge 0) then begin    ; download file
      dirname = file_dirname(localname)
      if file_test(dirname,/dir) eq 0 then begin
        dprint,dlevel=3,'Creating new directory: "'+dirname+'"'
        file_mkdir,dirname
      endif
      On_IOERROR, file_error2
      openw, wunit, localname, /get_lun
      t0 = systime(1)
      if length eq 0 and class eq 'text' then begin     ; Text file download
        dprint,dlevel=3,'Downloading as a text file.'
        while eof(unit) EQ 0 do begin
          readf, unit, text
          printf, wunit, text
        endwhile
      endif else begin                                  ; Non-text (binary) files
        ; NOTE(review): a non-text response without Content-Length (LENGTH=0)
        ; copies nothing and leaves an empty local file - confirm whether
        ; reading until end-of-stream is wanted in that case.
        maxb = 2l^20               ; 1 Megabyte default buffer size
        nb=0l                      ; bytes copied so far
        b=0l                       ; bytes copied since the last progress message
        percent = 0.               ; defined even if the loop below never runs
        while nb lt length do begin
          buffsize = maxb < (length-nb)
          aaa = bytarr(buffsize,/nozero)
          readu, unit, aaa
          writeu, wunit, aaa
          nb += buffsize
          t1 = systime(1)
          dt = t1-t0
          b += buffsize
          percent = 100.*float(nb)/length
          if (dt gt 5.) and (nb lt length) then begin   ; Wait 5 seconds between updates.
            rate = b/mb/dt         ; This will only display if the filesize (LENGTH) is greater than MAXB
            eta = (length-nb)/mb/rate +t1 - tstart
            messstr = string(format='("  ",f5.1," %  (",f0.1,"/",f0.1," secs)  @ ",f0.2," MB/s")', percent, t1-tstart,eta, rate ,/print)
            t0 = t1
            b =0l
            dprint,dlevel=1,messstr  &  wait,.01
            if obj_valid(progobj) then begin
              progobj->update,percent,text=messstr
              ; /ioerror routes the cancellation to the file_error2 handler below
              if progobj->checkcancel() then message,'Download cancelled by user',/ioerror
            endif
          endif
        endwhile
        t1 = systime(1)
        dt = t1 - tstart
        messstr = string(/print,format = "('Downloaded ',f0.3,' MBytes in ',f0.1,' secs @ ',f0.2,' MB/s  File: ""', a,'""' )",nb/mb,dt,nb/mb/dt,localname )
        dprint,dlevel=2,messstr
        if obj_valid(progobj) then begin
          progobj->update,percent,text=messstr
        endif
      endelse
      free_lun, wunit
      ; The block below is never entered by normal flow; it is only reached
      ; through the ON_IOERROR jump to the file_error2 label.
      if 0 then begin
        file_error2:
        beep
        dprint,dlevel=0,'Error downloading file: "',url,'"'
        error = !error_state.msg
        dprint,dlevel=0,error
        dprint,dlevel=0,'Deleting: "' + lcl.name +'"'
        if obj_valid(progobj) then begin
          progobj->update,0.,text=error
        endif
        if keyword_set(wunit) then free_lun, wunit
        ; The file may not exist if OPENW itself failed.
        file_delete,lcl.name,/allow_nonexistent
      endif
    endif else dprint,'Found local file: "' + localname + '" (No download needed)'
  endif else dprint,'Remote file not found! "'+ url + '"'

  free_lun, unit
  dprint,dlevel=4,'Closing server: ',server
  ;;

  final:
  if n_elements(verbose) ne 0 then dprint,setdebug=last_dbg   ; Reset previous debug level.
  return

END