; ---- Scroll down for main function -------


;Function:
;  spd_download_strip_domain
;  (derived from file_http_strip_domain in file_http_copy)
;Purpose:
;  Removes the leading scheme and domain ("http://domain.whatever/",
;  "https://..." or "ftp://...") from a link, if present.
;  Otherwise, returns the string unmodified.
;Inputs:
;  s: The (scalar) string to have the domain removed
;Returns:
;  s with the scheme and domain removed.  The result does not begin with "/".
function spd_download_strip_domain, s

    compile_opt idl2,hidden

  ;match a string containing the following in order
  ;#1 the beginning of the string
  ;#2 "ftp", "http" or "https"
  ;#3 "://"
  ;#4 followed by one or more characters that are not "/"
  ;#5 followed by one "/"
  ;
  ;Other notes:
  ;#1 The alternation must be grouped.  Without the parentheses around
  ;   "ftp|https?" the pattern reads as  "^ftp"  OR  "https?://[^/]+/",
  ;   which strips only the three characters "ftp" from ftp links and leaves
  ;   the http branch unanchored.
  match_pos = stregex(s, '^(ftp|https?)://[^/]+/', length=match_len, /fold_case)

  ;the pattern is anchored, so a successful match always starts at offset 0 and
  ;the match length is also the offset of the first character to keep
  if match_pos[0] ne -1 then begin
    return, strmid(s, match_len)
  endif

  return, s

end


;Function:
;  spd_download_is_parent_dir
;  (derived from file_http_is_parent_dir in file_http_copy)
;Purpose:
;  Predicate function, checks whether the provided link is a parent to the
;  current directory
;Inputs:
;  Current: Set to the full url for the current directory
;  Link: The link to be checked
;Returns:
;  1: if link is to current's parent
;  0: if link is not to current's parent (or link is undefined/empty)
function spd_download_is_parent_dir, current, link

    compile_opt idl2,hidden

  ;an undefined or empty link can never be a parent link
  if n_elements(link) eq 0 then return,0
  if strlen(link) eq 0 then return,0

  ;match a string containing the following in order
  ;#1 the contents of the variable "link" (escaped so it is matched literally)
  ;#2 one or more characters that are not "/"
  ;#3 the "/" character
  ;#4 the end of the string
  ;Other notes:
  ;#1 link will always end in "/" if it is a directory.  So there is no need to specify it in the regex
  ;#2 strip domain will always return a string that does not begin with a "/" (relative link), so we add it back in
  ;#3 escape_string is defined outside this file
  return,stregex("/"+spd_download_strip_domain(current),escape_string(link)+"[^/]+/$",/boolean,/fold_case)

end


;+
;Function:
;  spd_download_extract
;
;Purpose:
;  Helper function to parse <a>(link) tags from html index files.
;
;Calling Sequence:
;  return_value = spd_download_extract(string_array [,/relative] [,/normal]
;                                      [,no_parent_links=no_parent_links])
;
;Input:
;  string_array:  String array containing the html index file to be parsed
;  relative:  Set to strip out everything but the filename from a link
;  normal:  Set to only return links that don't have '*' or '?'
;           (don't think this should ever actually happen, but option retained just in case)
;  no_parent_links:  Set to the url of the current directory to automatically
;                    exclude backlinks to its parent directory
;
;Output:
;  return_value:  An empty string or array of strings with link destinations
;
;Notes:
;  Copied from file_http_copy subroutine extract_html_links_regex, original notes below:
;
;    "The _regex version of this routine is replacing the original version because
;    the old version made assumptions about the formatting of the .remote-index.html file
;    that were dependent upon the type of web server that was producing the file.  We think that
;    these bugs took so long to show up because Apache servers are extremely common.
;    Modification prompted so that file_http_copy can work more reliably rbspice & rb-emfisis
;    New version:
;      #1 Handles html that doesn't place the href attribute exactly one space after the link tag
;      #2 Handles cases where the server doesn't include newlines, or where multiple links are
;         included per line of returned html by the server"
;
;
;$LastChangedBy: aaflores $
;$LastChangedDate: 2015-02-18 16:27:58 -0800 (Wed, 18 Feb 2015) $
;$LastChangedRevision: 17004 $
;$URL: svn+ssh://thmsvn@ambrosia.ssl.berkeley.edu/repos/spdsoft/tags/spedas_3_1/general/spedas_tools/spd_download/spd_download_extract.pro $
;
;-
function spd_download_extract, string_array, $
                               relative=relative, $
                               normal=normal, $
                               no_parent_links=no_parent_links

    compile_opt idl2,hidden

  links = ''

  ;This regex is a little tricky, most of the complexity is to prevent it from matching
  ;two links when it should match one.
  ;
  ;  e.g. for '<a name="x"><a href="y">' it must match only '<a href="y">'
  ;       rather than the whole span from the first "<a" to the last ">".
  ;       Forbidding '<' and '>' between "<a" and "href" (and between the closing
  ;       quote and the final '>') keeps each match inside a single tag.

  if keyword_set(normal) then begin
    ;match a string containing the following in order
    ;#1 "<a"
    ;#2 0 or more characters that are not '<' or '>'
    ;#3 "href="
    ;#4 '"' (quotation mark)
    ;#5 0 or more characters that are not '"' '*' or '?'
    ;#6 '"' (quotation mark)
    ;#7 0 or more characters that are not '<' or '>'
    ;#8 The '>' character
    ;
    ;Other notes:
    ;#1 The () are not a part of the pattern.  They indicate that anything matching inside the parentheses is a captured sub-expression
    ;#2 Only the first '^' in a bracket expression negates it; the later '^' are
    ;   literal, so the '^' character itself is excluded by these classes as well
    link_finder_regex='<a[^>^<]*href="([^"^*^?]*)"[^<^>]*>'
  endif else begin
    ;match a string containing the following in order
    ;#1 "<a"
    ;#2 0 or more characters that are not '<' or '>'
    ;#3 "href="
    ;#4 '"' (quotation mark)
    ;#5 0 or more characters that are not '"'
    ;#6 '"' (quotation mark)
    ;#7 0 or more characters that are not '<' or '>'
    ;#8 The '>' character
    ;
    ;Other notes:
    ;#1 The () are not a part of the pattern.  They indicate that anything matching inside the parentheses is a captured sub-expression
    ;#2 Only the first '^' in a bracket expression negates it; the later '^' are
    ;   literal, so the '^' character itself is excluded by these classes as well
    link_finder_regex='<a[^>^<]*href="([^"]*)"[^<^>]*>'
  endelse

  ;perform search one line at a time
  for i=0, n_elements(string_array)-1 do begin

    ;working copy of the current line; matched tags are consumed from its front
    remaining = string_array[i]

    ;/subexp indicates that everything inside the () of the regex should be returned in the results so that they can be extracted
    ;pos[0]/length[0] describe the whole tag, pos[1]/length[1] the captured href value
    pos = stregex(remaining,link_finder_regex,/subexp,length=length,/fold_case)

    while pos[1] ne -1 do begin

      link = strmid(remaining,pos[1],length[1])       ;copy the href value out of the line
      remaining = strmid(remaining,pos[0]+length[0])  ;drop everything through the end of this tag, so that the next tag can be found

      ;exclude parent links, if keyword set and domain provided
      if n_elements(no_parent_links) gt 0 then begin
        if spd_download_is_parent_dir(no_parent_links,link) then begin
          link = ''
        endif
      endif

      if keyword_set(relative) then begin
        ;match a string containing the following in order
        ;#1 a "/"
        ;#2 one or more characters that are not "/"
        ;#3 an optional trailing "/" (zero or one)
        ;#4 the end of the string
        rel_pos = stregex(link,'/[^/]+/?$',/fold_case)

        ;keep only the last path component (plus its trailing "/", if any)
        if rel_pos[0] ne -1 then begin
          link = strmid(link,rel_pos+1)
        endif
      endif

      ;accumulate non-empty links; the initial '' placeholder is replaced by the first real link
      if strlen(link) gt 0 then begin
        if strlen(links[0]) gt 0 then begin
          links = [links,link]
        endif else begin
          links = [link]
        endelse
      endif

      pos = stregex(remaining,link_finder_regex,/subexp,length=length,/fold_case)

    endwhile

  endfor

  return, links

end