Html Agility Pack - I have C# code that looks like this and need to extract the same fields, but I am uncertain how to do it
In Python, I have code that looks like this, using libxml:
# Collect the calendar rows from the parsed page and copy them into the table.
# `node`, `getarow`, `newsitems` and `newstable` are defined elsewhere in the
# asker's program and are not shown here.
parser = etree.HTMLParser()
id = 0

# Every <tr> that carries a class attribute inside the main calendar <div>.
nodes = node.findall(r'.//div[@id="flexbox_flex_calendar_maincal"]//table/tr[@class]')
for x in nodes:
    # Only rows whose class starts with "calendar" are kept.
    if x.attrib['class'].startswith('calendar'):
        item = getarow(x, id)
        newsitems.addrow(item)
        # NOTE(review): the flattened source does not show whether this
        # increment sits inside the `if` or directly in the loop - confirm.
        id = id + 1

# Copy each stored row into the table, one column per field.
for id in range(0, newsitems.getlength()):
    rowdict = newsitems.getrow(id)
    if rowdict is not None:
        rowitems = QStringList([rowdict['time'],
                                rowdict['currency'],
                                rowdict['impact'],
                                rowdict['event'],
                                rowdict['actual'],
                                rowdict['forecast'],
                                rowdict['previous']])
        #newsitems[rowdict['time']].append(rowitems)
        newstable.addrow(rowitems)
I have code in C# that looks like this and needs to extract the same fields, but I am uncertain how to do it. The `whatnodestofind`
string is the problematical part.
using System;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
using System.Windows.Forms;
using HtmlAgilityPack;

namespace ConsoleApplication276
{
    /// <summary>
    /// Container for a URL and the parser action that receives the HTML
    /// downloaded from that URL.
    /// </summary>
    public class Link
    {
        /// <summary>The URL to navigate to.</summary>
        public string link { get; set; }

        /// <summary>Callback invoked with the page's HTML once it has loaded.</summary>
        public Action<string> parser { get; set; }
    }

    public class Program
    {
        // Index 0 is a placeholder so that DateTime.Month (1-12) can be used as the index directly.
        static string[] monthStrings = new string[]
        {
            "", "jan", "feb", "mar", "apr", "may", "jun", "july", "aug", "sep", "oct", "nov", "dec"
        };

        /// <summary>
        /// Builds the value appended to the calendar URL's <c>day=</c> query
        /// parameter from today's local date.
        /// </summary>
        /// <returns>The day number, the month string, a dot and the year, concatenated.</returns>
        public static string GetDateInForexFactoryFormat()
        {
            // NOTE(review): the sample URL in the accompanying answer uses the form
            // "aug7.2015" (month before day), while this produces day before month,
            // and "july" is the only four-letter month entry - confirm what the site expects.
            var today = System.DateTime.Now;
            var dayStr = today.Day.ToString();
            var monthStr = monthStrings[today.Month];
            var yearStr = today.Year.ToString();
            return dayStr + monthStr + '.' + yearStr;
        }

        /// <summary>
        /// Entry point of the console app: downloads each page and hands its
        /// HTML to the parser associated with the link.
        /// </summary>
        public static void Main(string[] args)
        {
            try
            {
                // More links can be added here; associate each link with a parser action.
                // Whatever data the parser should produce can be exposed as a property
                // on the Link container.
                var task = MessageLoopWorker.Run(DoWorkAsync, new Link()
                {
                    link = "http://www.forexfactory.com/calendar.php?day=" + GetDateInForexFactoryFormat(),
                    parser = (string html) =>
                    {
                        // Do whatever needs to happen here with the downloaded HTML.
                        var doc = new HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(html);

                        string whatNodesToFind = ".//div";
                        //string whatNodesToFind = "table";
                        //var someNodes = doc.DocumentNode.SelectSingleNode(whatNodesToFind);
                        var someNodes = doc.DocumentNode.SelectNodes(whatNodesToFind);

                        // SelectNodes returns null (not an empty collection) when the
                        // XPath matches nothing, so guard before enumerating.
                        if (someNodes != null)
                        {
                            foreach (var node in someNodes)
                            {
                                Console.WriteLine(node);
                            }
                        }
                    }
                });

                task.Wait();
                Console.WriteLine("doworkasync completed.");
            }
            catch (Exception ex)
            {
                Console.WriteLine("doworkasync failed: " + ex.Message);
            }

            Console.WriteLine("press enter exit.");
            Console.ReadLine();
        }

        /// <summary>
        /// Navigates a <see cref="WebBrowser"/> to each link in turn, waits for
        /// the document to complete, then passes the page HTML to the link's parser.
        /// </summary>
        /// <param name="args">The links to visit, in order.</param>
        /// <returns>Always <c>null</c>.</returns>
        public static async Task<Link> DoWorkAsync(Link[] args)
        {
            Console.WriteLine("start working.");

            using (var wb = new WebBrowser())
            {
                wb.ScriptErrorsSuppressed = true;

                // Re-created for every navigation; the handler below completes the current one.
                TaskCompletionSource<bool> tcs = null;
                WebBrowserDocumentCompletedEventHandler documentCompletedHandler =
                    (s, e) => tcs.TrySetResult(true);

                // Navigate to each URL in the list.
                foreach (var arg in args)
                {
                    tcs = new TaskCompletionSource<bool>();
                    wb.DocumentCompleted += documentCompletedHandler;
                    try
                    {
                        wb.Navigate(arg.link.ToString());

                        // Wait for DocumentCompleted.
                        await tcs.Task;

                        // After the page loads, pass the HTML to the parser.
                        arg.parser(wb.DocumentText);
                    }
                    finally
                    {
                        // Always unsubscribe, even if navigation or parsing threw.
                        wb.DocumentCompleted -= documentCompletedHandler;
                    }

                    // The DOM is ready.
                    Console.WriteLine(arg.link.ToString());
                    Console.WriteLine(wb.Document.Body.OuterHtml);
                }
            }

            Console.WriteLine("end working.");
            return null;
        }
    }

    /// <summary>
    /// Helper that starts a message loop on a dedicated STA thread and runs an
    /// asynchronous worker inside it.
    /// </summary>
    public static class MessageLoopWorker
    {
        /// <summary>
        /// Runs <paramref name="worker"/> on a new STA thread with a message loop.
        /// </summary>
        /// <param name="worker">The asynchronous work to execute inside the message loop.</param>
        /// <param name="args">The links passed through to <paramref name="worker"/>.</param>
        /// <returns>
        /// A task that yields the worker's result, or faults with the exception the worker threw.
        /// </returns>
        public static async Task<object> Run(Func<Link[], Task<Link>> worker, params Link[] args)
        {
            var tcs = new TaskCompletionSource<object>();

            var thread = new Thread(() =>
            {
                EventHandler idleHandler = null;

                idleHandler = async (s, e) =>
                {
                    // Handle Application.Idle just once.
                    Application.Idle -= idleHandler;

                    // Return to the message loop...
                    await Task.Yield();

                    // ...and continue asynchronously, propagating the result or exception.
                    try
                    {
                        var result = await worker(args);
                        tcs.SetResult(result);
                    }
                    catch (Exception ex)
                    {
                        tcs.SetException(ex);
                    }

                    // Signal the message loop to exit; Application.Run returns at this point.
                    Application.ExitThread();
                };

                // Handle Application.Idle just once, to make sure we are inside the
                // message loop and the SynchronizationContext has been installed.
                Application.Idle += idleHandler;
                Application.Run();
            });

            // Set the STA model for the new thread.
            thread.SetApartmentState(ApartmentState.STA);

            // Start the thread and await the task.
            thread.Start();
            try
            {
                return await tcs.Task;
            }
            finally
            {
                thread.Join();
            }
        }
    }
}
I tried this, but it doesn't work, meaning it returns no nodes. Yet I can see the nodes using Google Chrome's Inspect Element:
// Attempt from the question: select <div> elements that have a class attribute
// and whose id contains "flex".
// NOTE: d.Attributes["id"].Value throws when the current <div> has no id
// attribute; that access is kept here as written because it is the problem
// being asked about.
var findClasses = doc.DocumentNode
    .Descendants("div")
    .Where(d => d.Attributes.Contains("class")
                && d.Attributes["id"].Value.Contains("flex"));

foreach (var d in findClasses)
{
    Console.WriteLine(d);
}
Regarding the Edit 1 section, I'd recommend using `d.GetAttributeValue("id", "")`
to replace `d.Attributes["id"].Value`,
because the latter will throw an exception in case the current `d`
element doesn't have an `id` attribute
(and that did happen when parsing the HTML page retrieved from the URL in the sample):
// Load the page directly with HtmlWeb and list every <div> that has a class
// attribute and whose id contains "flex".
var link = "http://www.forexfactory.com/calendar.php?day=aug7.2015";
var doc = new HtmlWeb().Load(link);

var findClasses = doc.DocumentNode
    .Descendants("div")
    .Where(d => d.Attributes.Contains("class")
                // GetAttributeValue falls back to "" when the <div> has no id,
                // so elements without an id are filtered out instead of throwing.
                && d.GetAttributeValue("id", "").Contains("flex"));

foreach (var d in findClasses)
{
    Console.WriteLine("{0}, {1}", d.Name, d.GetAttributeValue("id", ""));
}
output :
div, flexbox_flex_minicalendar_
div, flexbox_flex_calendar_maincal
div, flexdatepicker_calendar_maincal_begindate
div, flexdatepicker_calendar_maincal_enddate
Comments
Post a Comment