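// GuiWebCrawler: a small GUI web crawler example. It downloads pages with up
// to 60 parallel HttpRequest connections, extracts http/https links from the
// downloaded html, and displays active downloads, finished pages, and the
// chain of links from the seed url to the selected page.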
#include <CtrlLib/CtrlLib.h>
#include <Core/SSL/SSL.h>

using namespace Upp;

#define LAYOUTFILE <GuiWebCrawler/GuiWebCrawler.lay>
#include <CtrlCore/lay.h>

struct WebCrawler : public WithCrawlerLayout<TopWindow> {
	BiVector<int>          todo;  // indices (into 'url') of pages queued for download
	VectorMap<String, int> url;   // all urls found so far -> index of the url they were found on

	struct Work {                 // a single download in progress
		HttpRequest http;
		int         urli;         // index into 'url'
	};
	Array<Work> http;             // active downloads
	int64       total;            // total bytes downloaded so far

	void ExtractUrls(const String& html, int srci);
	void ShowPath();
	void OpenURL(ArrayCtrl& a);

public:
	void Run();

	WebCrawler();
};

bool IsUrlChar(int c)
{ // characters accepted as part of an url
	return c == ':' || c == '.' || IsAlNum(c) || c == '_' || c == '%' || c == '/';
}

void WebCrawler::ExtractUrls(const String& html, int srci)
{ // scan html for http:// and https:// links, queue urls we have not seen yet
	int q = 0;
	while(q < html.GetCount()) {
		int http = html.Find("http://", q);
		int https = html.Find("https://", q);
		q = min(http < 0 ? https : http, https < 0 ? http : https); // nearest match, -1 if none
		if(q < 0)
			return;
		int b = q;
		while(q < html.GetCount() && IsUrlChar(html[q]))
			q++;
		String u = html.Mid(b, q - b);
		if(url.Find(u) < 0) { // a new url
			todo.AddTail(url.GetCount()); // queue it for download
			url.Add(u, srci); // remember the page it was found on
		}
	}
}

void WebCrawler::Run()
{ // crawl the web, starting from a seed url
	String seed = "www.ultimatepp.org";
	if(!EditText(seed, "GuiWebSpider", "Seed URL"))
		return;
	todo.AddTail(0);
	url.Add(seed, 0); // seed url; the parent index is never read for the seed
	Open();
	while(IsOpen()) { // keep going until the user closes the window
		ProcessEvents(); // keep the GUI responsive
		while(todo.GetCount() && http.GetCount() < 60) { // run up to 60 downloads in parallel
			int i = todo.Head();
			todo.DropHead();
			Work& w = http.Add();
			w.urli = i;
			w.http.Url(url.GetKey(i))
			      .UserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:11.0) Gecko/20100101 Firefox/11.0")
			      .Timeout(0); // Timeout(0) puts the request into non-blocking mode
			work.Add(url.GetKey(i));
			work.HeaderTab(0).SetText(Format("URL (%d)", work.GetCount()));
		}
		SocketWaitEvent we; // wait for activity on any of the active sockets...
		for(int i = 0; i < http.GetCount(); i++)
			we.Add(http[i].http, http[i].http.GetWaitEvents());
		we.Wait(10); // ...but at most 10 ms, so the GUI keeps running
		int i = 0;
		while(i < http.GetCount()) { // advance all active requests
			Work& w = http[i];
			w.http.Do(); // perform the next step of the request
			String u = url.GetKey(w.urli);
			int q = work.Find(u);
			if(w.http.InProgress()) {
				if(q >= 0)
					work.Set(q, 1, w.http.GetPhaseName()); // update the status column
				i++;
			}
			else { // finished, with either content or an error
				String html = w.http.GetContent();
				total += html.GetCount();
				finished.Add(u, w.http.IsError() ? String().Cat() << w.http.GetErrorDesc()
				                                 : String().Cat() << w.http.GetStatusCode()
				                                   << ' ' << w.http.GetReasonPhrase()
				                                   << " (" << html.GetCount() << " bytes)",
				             w.urli);
				finished.HeaderTab(0).SetText(Format("Finished (%d)", finished.GetCount()));
				finished.HeaderTab(1).SetText(Format("Response (%` KB)", total >> 10));
				if(w.http.IsSuccess()) {
					ExtractUrls(html, w.urli); // harvest links from the downloaded page
					Title(AsString(url.GetCount()) + " URLs found");
				}
				http.Remove(i); // done, drop the request
				work.Remove(q);
			}
		}
	}
}

void WebCrawler::ShowPath()
{ // show the chain of links that led from the seed to the selected url
	path.Clear();
	if(!finished.IsCursor())
		return;
	int i = finished.Get(2); // the third row value is the url index
	Vector<String> p;
	for(;;) { // walk the parent chain back to the seed
		p.Add(url.GetKey(i));
		if(i == 0) // reached the seed url
			break;
		i = url[i]; // index of the page this url was found on
	}
	for(int i = p.GetCount() - 1; i >= 0; i--) // display it seed-first
		path.Add(p[i]);
}

void WebCrawler::OpenURL(ArrayCtrl& a)
{ // copy the selected url to the clipboard and open it in the default browser
	String u = a.GetKey();
	WriteClipboardText(u);
	LaunchWebBrowser(u);
}

WebCrawler::WebCrawler()
{
	CtrlLayout(*this, "WebCrawler");
	work.AddColumn("URL");
	work.AddColumn("Status");
	finished.AddColumn("Finished");
	finished.AddColumn("Response");
	finished.WhenCursor = [=] { ShowPath(); };
	finished.WhenLeftDouble = [=] { OpenURL(finished); };
	path.AddColumn("Path");
	path.WhenLeftDouble = [=] { OpenURL(path); };
	total = 0;
	Zoomable().Sizeable();
}

GUI_APP_MAIN
{
	HttpRequest::Trace(); // log http activity into the standard log

	WebCrawler().Run();
}