; ---- Scroll down for main function -------
;Function:
; spd_download_strip_domain
;Purpose:
; Removes the leading scheme and domain (e.g. http://domain.whatever/) from an html link, if present. Otherwise, returns the string unmodified.
;Inputs:
; s: The string to have the domain removed
;Returns:
; s with the domain removed
function spd_download_strip_domain,s

  compile_opt idl2,hidden

  ;Strip a leading "scheme://domain/" prefix from a link.
  ;
  ;Input:
  ;  s: the link (string) to process
  ;Returns:
  ;  s without the leading scheme and domain, or s unchanged if it does not
  ;  start with one.
  ;
  ;match a string containing the following in order
  ;#1 the beginning of the string
  ;#2 "ftp" or "http", optionally followed by "s" (case-insensitive)
  ;#3 "://"
  ;#4 one or more characters that are not "/"
  ;#5 one "/"
  ;
  ;Other notes:
  ;#1 The alternation is grouped as (ftp|http). Alternation has the lowest
  ;   precedence in a regex, so an ungrouped "^(ftp)|(http)s?://..." would
  ;   match either a bare leading "ftp" or an unanchored "http(s)://host/"
  ;   anywhere in the string.
  match_start = stregex(s,"^(ftp|http)s?://[^/]+/",length=match_len,/fold_case)

  if match_start[0] ne -1 then begin
    ;the match is anchored at position 0, so its length is also the offset
    ;at which the remainder of the link begins
    return, strmid(s,match_len)
  endif else begin
    return, s
  endelse

end
;Function:
; spd_download_is_parent_dir
;Purpose:
; Predicate function, checks whether the provided link points to the parent of the current directory
;Inputs:
; Current: Set to the full url for the current directory
; Link: The link to be checked
;Returns:
; 1: if link is to current's parent
; 0: if link is not to current's parent
function spd_download_is_parent_dir, current, link

  compile_opt idl2,hidden

  ;an undefined or empty link can never point at the parent directory
  if n_elements(link) eq 0 then return,0
  if strlen(link) eq 0 then return,0

  ;Path of the current directory with the domain removed.
  ;spd_download_strip_domain returns the path without its leading "/"
  ;(relative form), so the "/" is added back before comparing.
  current_path = "/"+spd_download_strip_domain(current)

  ;Build a pattern that matches, in order:
  ;#1 the contents of the variable "link"
  ;   (passed through escape_string, which is defined elsewhere; presumably it
  ;    escapes regex metacharacters so link is matched literally - TODO confirm)
  ;#2 one or more characters that are not "/"
  ;#3 the "/" character
  ;#4 the end of the string
  ;
  ;Other notes:
  ;#1 link will always end in "/" if it is a directory, so there is no need
  ;   to specify a separator between link and the final path component
  parent_pattern = escape_string(link)+"[^/]+/$"

  ;/boolean makes stregex return true/false rather than a match position
  return, stregex(current_path,parent_pattern,/boolean,/fold_case)

end
;+
;Function:
; spd_download_extract
;
;Purpose:
; Helper function to parse (link) tags from html index files.
;
;Calling Sequence:
; return_value = spd_download_extract(string_array [,/relative] [,/normal]
; [,no_parent_links=no_parent_links])
;
;Input:
; string_array: String array containing the html index file to be parsed
; relative: Set to strip out everything but the filename from a link
; normal: Set to return only links that don't contain '*' or '?'
; (don't think this should ever actually happen, but option retained just in case)
; no_parent_links: Set to the full url of the current directory to automatically exclude backlinks to its parent directory
;
;Output:
; return_value: An empty string or array of strings with link destinations
;
;Notes:
; Copied from file_http_copy subroutine extract_html_links_regex, original notes below:
;
; "The _regex version of this routine is replacing the original version because
; the old version made assumptions about the formatting of the .remote-index.html file
; that were dependent upon the type of web server that was producing the file. We think that
; these bugs took so long to show up because Apache servers are extremely common.
; Modification prompted so that file_http_copy can work more reliably rbspice & rb-emfisis
; New version:
; #1 Handles html that doesn't place the href attribute exactly one space after the link tag
; #2 Handles cases where the server doesn't include newlines, or where multiple links are
; included per line of returned html by the server"
;
;
;$LastChangedBy: aaflores $
;$LastChangedDate: 2015-02-18 16:27:58 -0800 (Wed, 18 Feb 2015) $
;$LastChangedRevision: 17004 $
;$URL: svn+ssh://thmsvn@ambrosia.ssl.berkeley.edu/repos/spdsoft/tags/spedas_3_3/general/spedas_tools/spd_download/spd_download_extract.pro $
;
;-
function spd_download_extract, string_array, $
                               relative=relative, $
                               normal=normal, $
                               no_parent_links=no_parent_links

  compile_opt idl2,hidden

  ;Extract the href destinations of all <a ...> tags found in string_array.
  ;
  ;Input:
  ;  string_array: string or string array holding the html to be parsed
  ;  relative: if set, keep only the last path component of each link
  ;  normal: if set, skip links whose destination contains '*' or '?'
  ;  no_parent_links: if provided, it is passed to spd_download_is_parent_dir
  ;                   as the current directory url, and links identified as
  ;                   pointing to its parent are dropped
  ;Returns:
  ;  '' if no links were found, otherwise a string array of link destinations

  ;an empty string marks "no links found yet"
  links = ''

  ;This regex is a little tricky, most of the complexity is to prevent it from matching
  ;two links when it should match one.
  ;
  ;  e.g. for a line such as
  ;         <a href="one">one</a> <a href="two">two</a>
  ;       a greedy pattern could match from the first "<a" to the last ">"
  ;       instead of stopping at the ">" that closes the first tag.
  ;       Excluding '<' and '>' between the pieces of the pattern keeps every
  ;       match inside a single tag.

  if keyword_set(normal) then begin

    ;match a string containing the following in order
    ;#1 "<a"
    ;#2 0 or more characters that are not '<' or '>'
    ;#3 "href="
    ;#4 '"' (quotation mark)
    ;#5 0 or more characters that are not '"' '*' or '?'
    ;#6 '"' (quotation mark)
    ;#7 0 or more characters that are not '<' or '>'
    ;#8 The '>' character
    ;
    ;Other notes:
    ;#1 The () are not a part of the pattern.  They indicate that anything matching inside the parentheses is a captured sub-expression
    ;#2 Inside a [^...] bracket only the first '^' negates; the later '^' are literal, so '^' itself is also excluded
    link_finder_regex='<a[^>^<]*href="([^"^*^?]*)"[^<^>]*>'

  endif else begin

    ;match a string containing the following in order
    ;#1 "<a"
    ;#2 0 or more characters that are not '<' or '>'
    ;#3 "href="
    ;#4 '"' (quotation mark)
    ;#5 0 or more characters that are not '"'
    ;#6 '"' (quotation mark)
    ;#7 0 or more characters that are not '<' or '>'
    ;#8 The '>' character
    ;
    ;Other notes:
    ;#1 The () are not a part of the pattern.  They indicate that anything matching inside the parentheses is a captured sub-expression
    ;#2 Inside a [^...] bracket only the first '^' negates; the later '^' are literal, so '^' itself is also excluded
    link_finder_regex='<a[^>^<]*href="([^"]*)"[^<^>]*>'

  endelse

  ;perform search one line at a time
  for i=0, n_elements(string_array)-1 do begin

    ;working copy of the line; it is shortened after every match
    line = string_array[i]

    ;/subexp indicates that everything inside the () of the regex should be returned in the results so that they can be extracted
    ;pos[0]/length[0] describe the whole tag, pos[1]/length[1] the captured link
    pos = stregex(line,link_finder_regex,/subexp,length=length,/fold_case)

    while pos[1] ne -1 do begin

      link = strmid(line,pos[1],length[1]) ;copy the link destination out of the line

      line = strmid(line,pos[0]+length[0]) ;drop everything up to the end of this tag, so that the next tag can be processed

      ;exclude parent links, if keyword set and domain provided
      if n_elements(no_parent_links) gt 0 then begin
        if spd_download_is_parent_dir(no_parent_links,link) then begin
          link = ''
        endif
      endif

      if keyword_set(relative) then begin

        ;match a string containing the following in order
        ;#1 a "/"
        ;#2 one or more characters that are not "/"
        ;#3 an optional trailing "/"
        ;#4 the end of the string
        rel_pos = stregex(link,'/[^/]+/?$',/fold_case)

        if rel_pos[0] ne -1 then begin
          ;keep only what follows the matched leading "/"
          link = strmid(link,rel_pos+1)
        endif
      endif

      ;empty links (including excluded parent links) are not recorded
      if strlen(link) gt 0 then begin
        if strlen(links[0]) gt 0 then begin
          links = [links,link]
        endif else begin
          ;first real link replaces the '' placeholder
          links = [link]
        endelse
      endif

      pos = stregex(line,link_finder_regex,/subexp,length=length,/fold_case)

    endwhile

  endfor

  return, links

end