function [U,G] = surfer(root,n)
% UPDATED VERSION
% SURFER Create the adjacency graph of a portion of the Web.
% [U,G] = surfer(root,n) starts at the URL root and follows
% Web links until it forms an adjacency graph with n nodes.
% U = a cell array of n strings, the URLs of the nodes.
% G = an n-by-n sparse matrix with G(i,j)=1 if node j is linked to node i.
%
% Example: [U,G] = surfer('https://inf.ethz.ch/',500);
% See also PAGERANK.
%
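% The outputs are typically handed to NCM's pagerank. A minimal sketch,
% assuming the NCM toolbox function pagerank(U,G) is on the path:
%
%    [U,G] = surfer('https://inf.ethz.ch/',100);
%    x = pagerank(U,G);
%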
% This function currently has two defects. (1) The algorithm for
% finding links is naive. We just look for the string 'https:'.
% (2) An attempt to read from a URL that is accessible, but very slow,
% might take an unacceptably long time to complete. In some cases,
% it may be necessary to have the operating system terminate MATLAB.
% Key words from such URLs can be added to the skip list in surfer.m.
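%
% For defect (1), a plausible improvement, sketched here as a suggestion
% rather than a tested fix, is to match complete links with a regular
% expression instead of scanning for a fixed prefix:
%
%    links = regexp(page,'https?://[^"''<> ]*','match');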

% Initialize

clf
shg
% Double buffering is the default for MATLAB graphics since R2014b,
% so no figure property needs to be set for flicker-free animation.
axis([0 n 0 n])
axis square
axis ij
box on
set(gca,'position',[.12 .20 .78 .78])
uicontrol('style','frame','units','normal','position',[.01 .09 .98 .07]);
uicontrol('style','frame','units','normal','position',[.01 .01 .98 .07]);
t1 = uicontrol('style','text','units','normal','position',[.02 .10 .94 .04], ...
   'horiz','left');
t2 = uicontrol('style','text','units','normal','position',[.02 .02 .94 .04], ...
   'horiz','left');
slow = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .24 .07 .05],'string','slow','value',0);
quit = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .17 .07 .05],'string','quit','value',0);

U = cell(n,1);                % URLs of the nodes
hash = zeros(n,1);            % hash codes of the URLs, for fast lookup
G = logical(sparse(n,n));     % sparse adjacency matrix
m = 1;                        % number of nodes found so far
U{m} = root;
hash(m) = hashfun(root);

j = 1;
while j < n && get(quit,'value') == 0

   % Try to open a page.

   try
      set(t1,'string',sprintf('%5d %s',j,U{j}))
      set(t2,'string','');
      drawnow
      page = urlread(U{j});
   catch
      set(t1,'string',sprintf('fail: %5d %s',j,U{j}))
      drawnow
      j = j+1;
      continue
   end
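
   % Note: urlread is deprecated in newer MATLAB releases. A minimal
   % alternative, sketched as an assumption, is webread with an explicit
   % timeout, which also mitigates defect (2) above:
   %    page = webread(U{j}, weboptions('Timeout',10));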
   if get(slow,'value')
      pause(.25)
   end

   % Follow the links from the open page.

   for f = strfind(page,'https:')

      % A link starts with 'https:' and ends with the next quote.

      e = min([strfind(page(f:end),'"') strfind(page(f:end),'''')]);
      if isempty(e), continue, end
      url = deblank(page(f:f+e-2));
      url(url<' ') = '!';   % Nonprintable characters
      if url(end) == '/', url(end) = []; end

      % Look for links that should be skipped.

      skips = {'.gif','.jpg','.jpeg','.pdf','.css','.asp','.mwc','.ram', ...
               '.cgi','lmscadsi','cybernet','w3.org','google','yahoo', ...
               'scripts','netscape','shockwave','webex','fansonly', ...
               'idref.fr','purl.org','freedomdefined','wernfbox'};

      skip = any(url=='!') || any(url=='?');
      k = 0;
      while ~skip && (k < length(skips))
         k = k+1;
         skip = contains(url,skips{k});
      end
      if skip
         if ~contains(url,'.gif') && ~contains(url,'.jpg')
            set(t2,'string',sprintf('skip: %s',url))
            drawnow
            if get(slow,'value')
               pause(.25)
            end
         end
         continue
      end

      % Check if page is already in url list.

      i = 0;
      for k = find(hash(1:m) == hashfun(url))'
         if isequal(U{k},url)
            i = k;
            break
         end
      end

      % Add a new url to the graph if there are fewer than n.

      if (i == 0) && (m < n)
         m = m+1;
         U{m} = url;
         hash(m) = hashfun(url);
         i = m;
      end

      % Add a new link.

      if i > 0
         G(i,j) = 1;
         set(t2,'string',sprintf('%5d %s',i,url))
         line(j,i,'marker','.','markersize',6)
         drawnow
         if get(slow,'value')
            pause(.5)
         end
      end
   end

   j = j+1;
end
delete(t1)
delete(t2)
delete(slow)
set(quit,'string','close','callback','close(gcf)','value',0)
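
% A quick way to inspect the crawl, offered as a suggestion:
%    spy(G)                    % sparsity pattern of the link matrix
%    indeg = full(sum(G,2));   % links pointing into each node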

%------------------------

function h = hashfun(url)
% Almost unique numeric hash code for pages already visited.
h = length(url) + 1024*sum(url);
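
% Collisions are possible: for example, hashfun('ab') and hashfun('ba')
% return the same value because the character sum ignores order. The
% caller guards against this by comparing U{k} with the url string
% before trusting a hash match.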