-- kpcyrd/passive-spider
--- Record what can be learned passively from a single link.
-- Resolves `href` against `parent`; only http(s) URLs are processed further.
-- If the host's registrable domain is already present in the `domain` table,
-- the host is added as a `subdomain`. Links to a few well-known hosts are
-- additionally turned into `phonenumber` / `account` records.
-- @param parent URL the link is resolved against
-- @param href   raw href value of the link; nil is ignored
function entry(parent, href)
    if href == nil then
        return
    end

    local url = url_join(parent, href)
    if last_err() then return clear_err() end
    if url:match('^https?://') == nil then
        return
    end

    local parts = url_parse(url)
    -- Clear the error just like the url_join failure path above: a stale
    -- error would otherwise make the next call bail out at its first
    -- last_err() check and skip a perfectly valid link.
    if last_err() then return clear_err() end

    local host = parts['host']
    local path = parts['path']

    -- NOTE(review): a failure of psl_domain_from_dns_name is not checked
    -- here (same as before) — confirm whether it can fail for some hosts.
    local psl = psl_domain_from_dns_name(host)
    local domain_id = db_select('domain', psl)
    if domain_id ~= nil then
        db_add('subdomain', {
            domain_id=domain_id,
            value=host,
        })
    end

    if host == 'wa.me' then
        -- The path is expected to consist only of the digits of the number.
        local m = regex_find('^/([0-9]+)$', path)
        if m then
            db_add('phonenumber', {
                -- m[2] is the first capture group
                value='+' .. m[2],
            })
        end
    end

    -- Hosts whose first path segment is treated as a username, mapped to the
    -- service name stored with the account.
    local account_services = {
        ['www.instagram.com']='instagram.com',
        ['www.facebook.com']='facebook.com',
    }
    local service = account_services[host]
    if service then
        local m = regex_find('^/([^/]+)', path)
        if m then
            db_add('account', {
                service=service,
                username=m[2],
            })
        end
    end
end
--- Extract every `a` element from a response body and hand its href to `entry`.
-- Does nothing when the body is missing or empty, when decoding or HTML
-- selection reports an error via last_err(), or when no `a` elements exist.
-- @param arg table; `arg['body']` is the raw response body and
--            `arg['value']` is passed to `entry` as the parent URL
function run(arg)
    local raw = arg['body']
    if raw == nil or #raw == 0 then
        return
    end

    local body = utf8_decode(raw)
    -- The error is intentionally left set on these paths (not cleared).
    if last_err() then return end

    local links = html_select_list(body, 'a')
    if last_err() then return end
    if #links == 0 then
        return
    end

    for i = 1, #links do
        -- href is nil for anchors without that attribute; entry ignores nil.
        local href = links[i]['attrs']['href']
        entry(arg['value'], href)
    end
end