问题描述
我用下面的方法以编程方式实例化一个 WebBrowser，导航到某个 URL，并在文档加载完成后返回结果。
如果文档加载超过 5 秒，
我该如何停止该 Task，
并让 GetFinalUrl()
返回 null？
我看到过很多使用 TaskFactory 的例子，
但一直没能把它应用到这段代码上。
private Uri GetFinalUrl(PortalMerchant portalMerchant)
{
SetBrowserFeatureControl();
Uri finalUri = null;
if (string.IsNullOrEmpty(portalMerchant.Url))
{
return null;
}
Uri trackingUrl = new Uri(portalMerchant.Url);
var task = MessageLoopWorker.Run(DoWorkAsync, trackingUrl);
task.Wait();
if (!String.IsNullOrEmpty(task.Result.ToString()))
{
return new Uri(task.Result.ToString());
}
else
{
throw new Exception("Parsing Failed");
}
}
// 作者：Noseratio - http://stackoverflow.com/users/1768303/noseratio
static async Task<object> DoWorkAsync(object[] args)
{
_threadCount++;
Console.WriteLine("Thread count:" + _threadCount);
Uri retVal = null;
var wb = new WebBrowser();
wb.ScriptErrorsSuppressed = true;
TaskCompletionSource<bool> tcs = null;
WebBrowserDocumentCompletedEventHandler documentCompletedHandler = (s, e) => tcs.TrySetResult(true);
foreach (var url in args)
{
tcs = new TaskCompletionSource<bool>();
wb.DocumentCompleted += documentCompletedHandler;
try
{
wb.Navigate(url.ToString());
await tcs.Task;
}
finally
{
wb.DocumentCompleted -= documentCompletedHandler;
}
retVal = wb.Url;
wb.Dispose();
return retVal;
}
return null;
}
public static class MessageLoopWorker
{
#region Public static methods
public static async Task<object> Run(Func<object[], Task<object>> worker, params object[] args)
{
var tcs = new TaskCompletionSource<object>();
var thread = new Thread(() =>
{
EventHandler idleHandler = null;
idleHandler = async (s, e) =>
{
// 只处理一次 Application.Idle
Application.Idle -= idleHandler;
// 返回消息循环
await Task.Yield();
// 然后异步继续
// 传播结果或异常
try
{
var result = await worker(args);
tcs.SetResult(result);
}
catch (Exception ex)
{
tcs.SetException(ex);
}
// 发出退出消息循环的信号
// Application.Run 将在此处退出
Application.ExitThread();
};
// 只处理一次 Application.Idle
// 以确保我们处于消息循环之中
// 并且 SynchronizationContext 已正确安装
Application.Idle += idleHandler;
Application.Run();
});
// 为新线程设置 STA 模式
thread.SetApartmentState(ApartmentState.STA);
// 启动线程并等待任务
thread.Start();
try
{
return await tcs.Task;
}
finally
{
thread.Join();
}
}
#endregion
}
更新：基于 WebBrowser 的控制台网页抓取器的最新版本可以在 Github 上找到。
更新：添加了 WebBrowser 对象池，用于多个并行下载。
下面是一个或多或少通用的、基于 WebBrowser 的网页抓取器实现，它以控制台应用程序的形式运行。它整合了我之前一些与 WebBrowser 相关的工作，包括问题中引用的代码：
几点说明：
- 可重用的 MessageLoopApartment 类用于启动并运行一个拥有自己消息泵的 WinForms STA 线程。如下所示，它可以在控制台应用程序中使用。这个类公开了一个 TPL 任务调度器（FromCurrentSynchronizationContext）以及一组使用该任务调度器的 Task.Factory.StartNew 包装方法。
- 这使得 async/await 成为在该独立 STA 线程上运行 WebBrowser 导航任务的绝佳工具。这样，WebBrowser 对象就在该线程上被创建、导航和销毁。不过，MessageLoopApartment 并不专门绑定于 WebBrowser。
- 必须使用浏览器功能控制（Browser FeatureControl）启用 HTML5 渲染，否则 WebBrowser 对象默认以 IE7 仿真模式运行。这正是下面的 SetFeatureBrowserEmulation 所做的事情。
- 并不总是能够 100% 确定网页何时完成渲染。有些页面相当复杂，并且持续使用 AJAX 更新。不过我们可以做到相当接近：先处理 DocumentCompleted 事件，然后轮询页面当前的 HTML 快照是否发生变化，并检查 WebBrowser.IsBusy 属性。这正是下面的 NavigateAsync 所做的事情。
- 在上述逻辑之上还有超时逻辑，以防页面渲染永不结束（注意 CancellationTokenSource 和 CreateLinkedTokenSource）。
使用的Microsoft.Win32;使用系统;使用的System.Threading;使用System.Threading.Tasks;使用System.Windows.Forms的;命名空间Console_22239357{ 类节目 { //通过Noseratio - http://stackoverflow.com/a/22262976/1768303 //主逻辑 静态异步任务ScrapSitesAsync(字符串[]的网址,的CancellationToken令牌) { 使用(VAR公寓=新MessageLoopApartment()) { //创建web浏览器里面MessageLoopApartment VAR web浏览器= apartment.Invoke(()=>新建web浏览器()); 尝试 { 的foreach(URL中VAR URL) { Console.WriteLine(网址:\ N+网址); //取消30多岁或当主令牌信号 VAR navigationCts = CancellationTokenSource.CreateLinkedTokenSource(标记); navigationCts.CancelAfter((int)的TimeSpan.FromSeconds(30).TotalMilliseconds); VAR navigationToken = navigationCts.Token; //内运行MessageLoopApartment导航任务 字符串的HTML =等待apartment.Run(()=> webBrowser.NavigateAsync(URL,navigationToken),navigationToken); Console.WriteLine(HTML:\ N+ HTML); } } 最后 { //处理web浏览器的内部MessageLoopApartment apartment.Invoke(()=> webBrowser.Dispose()); } } } // 入口点 静态无效的主要(字串[] args) { 尝试 { WebBrowserExt.SetFeatureBrowserEmulation(); //启用HTML5 VAR CTS =新CancellationTokenSource((INT)TimeSpan.FromMinutes(3).TotalMilliseconds); VAR任务= ScrapSitesAsync( 新的[] {http://example.com,http://example.org,http://example.net}, cts.Token); task.Wait(); Console.WriteLine(preSS Enter键退出...); 到Console.ReadLine(); } 赶上(例外前) { 而(前为AggregateException和放大器;&安培;!ex.InnerException = NULL) EX = ex.InnerException; Console.WriteLine(ex.Message); Environment.Exit(-1); } } } ///<总结> /// WebBrowserExt - web浏览器的扩展 ///通过Noseratio - http://stackoverflow.com/a/22262976/1768303 ///< /总结> 公共静态类WebBrowserExt { const int的POLL_DELAY = 500; //浏览和下载 公共静态异步任务<字符串> NavigateAsync(此web浏览器的浏览器,字符串的URL,的CancellationToken令牌) { //浏览伺机DocumentCompleted VAR TCS =新TaskCompletionSource<布尔>(); WebBrowserDocumentCompletedEventHandler处理器=(S,ARG)=> tcs.TrySetResult(真正的); 使用(token.Register(()=> tcs.TrySetCanceled(),useSynchronizationContext:真)) { webBrowser.DocumentCompleted + =处理程序; 尝试 { webBrowser.Navigate(URL); 等待tcs.Task; //等待DocumentCompleted } 最后 { webBrowser.DocumentCompleted - =处理程序; } } //获取根元素 
变种documentElement = webBrowser.Document.GetElementsByTagName(HTML)[0]; //查询当前的HTML的变化asynchronosly VAR HTML = documentElement.OuterHtml; 而(真) { //等待异步,这将抛出如果取消要求 等待Task.Delay(POLL_DELAY,令牌); //继续扫描,如果web浏览器仍然是忙 如果(webBrowser.IsBusy) 继续; VAR htmlNow = documentElement.OuterHtml; 如果(HTML == htmlNow) 打破; //未检测到变化,最终投票循环 HTML = htmlNow; } //考虑完全呈现的页面 token.ThrowIfCancellationRequested(); 返回HTML; } //启用HTML5(假设我们正在运行的IE10 +) //更多信息:http://stackoverflow.com/a/18333982/1768303 公共静态无效SetFeatureBrowserEmulation() { 如果(System.ComponentModel.LicenseManager.UsageMode!= System.ComponentModel.LicenseUsageMode.Runtime) 返回; 变种的appName = System.IO.Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess()MainModule.FileName。); Registry.SetValue(@HKEY_CURRENT_USER \软件\微软\的Internet Explorer \ MAIN \ FeatureControl \ FEATURE_BROWSER_EMULATION 的appName,10000,RegistryValueKind.DWord); } } ///<总结> /// MessageLoopApartment /// STA线程与任务串行执行的消息泵 ///通过Noseratio - http://stackoverflow.com/a/22262976/1768303 ///< /总结> 公共类MessageLoopApartment:IDisposable的 { 螺纹_Thread; // STA线程 的TaskScheduler _taskScheduler; // STA线程的任务调度 公众的TaskScheduler的TaskScheduler {{返回_taskScheduler; }} ///<总结> MessageLoopApartment构造< /总结> 公共MessageLoopApartment() { VAR TCS =新TaskCompletionSource<的TaskScheduler>(); //启动一个STA线程,并得到一个任务调度 _Thread =新主题(startArg => { 事件处理程序idleHandler = NULL; idleHandler =(S,E)=> { //处理Application.Idle只有一次 Application.Idle - = idleHandler; //返回任务调度程序 tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext()); }; //处理Application.Idle只有一次 //确保我们的消息循环中 //和的SynchronizationContext已经正确安装 Application.Idle + = idleHandler; Application.Run(); }); _thread.SetApartmentState(ApartmentState.STA); _thread.IsBackground = TRUE; _thread.Start(); _taskScheduler = tcs.Task.Result; } ///<总结>关机STA线程< /总结> 公共无效的Dispose() { 如果(_taskScheduler!= NULL) { VAR的TaskScheduler = _taskScheduler; _taskScheduler = NULL; // STA线程上执行Application.ExitThread() Task.Factory.StartNew( ()=> Application.ExitThread(), 
CancellationToken.None, TaskCreationOptions.None, 的TaskScheduler).Wait(); _thread.Join(); _Thread = NULL; } } ///<总结> Task.Factory.StartNew包装< /总结> 公共无效调用(动作的动作) { Task.Factory.StartNew(动作, CancellationToken.None,TaskCreationOptions.None,_taskScheduler).Wait(); } 公共TResult调用< TResult>(Func键< TResult>动作) { 返回Task.Factory.StartNew(动作, CancellationToken.None,TaskCreationOptions.None,_taskScheduler)。结果; } 公共任务运行(操作动作,的CancellationToken令牌) { 返回Task.Factory.StartNew(动作,令牌,TaskCreationOptions.None,_taskScheduler); } 公共任务< TResult>润LT; TResult>(Func键< TResult>行动的CancellationToken令牌) { 返回Task.Factory.StartNew(动作,令牌,TaskCreationOptions.None,_taskScheduler); } 公共任务运行(Func键<任务>行动的CancellationToken令牌) { 返回Task.Factory.StartNew(动作,令牌,TaskCreationOptions.None,_taskScheduler).Unwrap(); } 公共任务< TResult>润LT; TResult>(Func键<任务< TResult>>行动的CancellationToken令牌) { 返回Task.Factory.StartNew(动作,令牌,TaskCreationOptions.None,_taskScheduler).Unwrap(); } }}
I am using this method to instantiate a web browser programmatically, navigate to a url and return a result when the document has completed.
How would I be able to stop the Task
and have GetFinalUrl()
return null
if the document takes more than 5 seconds to load?
I have seen many examples using a TaskFactory
but I haven't been able to apply it to this code.
/// <summary>
/// Resolves the URL that a merchant's tracking link finally lands on by loading it
/// through <c>DoWorkAsync</c> on the thread started by <c>MessageLoopWorker.Run</c>.
/// </summary>
/// <param name="portalMerchant">Merchant whose <c>Url</c> is navigated to.</param>
/// <returns>
/// The final URL, or <c>null</c> when the merchant has no URL or the document did not
/// finish loading within five seconds.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The navigation finished but produced no usable URL.
/// </exception>
private Uri GetFinalUrl(PortalMerchant portalMerchant)
{
    SetBrowserFeatureControl();

    if (string.IsNullOrEmpty(portalMerchant.Url))
    {
        return null;
    }

    Uri trackingUrl = new Uri(portalMerchant.Url);
    TimeSpan loadTimeout = TimeSpan.FromSeconds(5);

    // Written by the worker before it completes and read only after task.Wait() returns.
    bool timedOut = false;

    // Race the navigation against a timer inside the worker itself, so that on timeout
    // the worker returns and MessageLoopWorker can leave its message loop instead of
    // waiting forever for a DocumentCompleted event that never arrives.
    // NOTE(review): when the timer wins, the pending navigation is abandoned and the
    // browser created by DoWorkAsync is not explicitly disposed by this method.
    Func<object[], Task<object>> timedWorker = async workerArgs =>
    {
        Task<object> navigation = DoWorkAsync(workerArgs);
        Task finished = await Task.WhenAny(navigation, Task.Delay(loadTimeout));
        if (finished != navigation)
        {
            timedOut = true;
            return null;
        }

        // Awaiting the completed task re-throws its exception, if any.
        return await navigation;
    };

    var task = MessageLoopWorker.Run(timedWorker, trackingUrl);
    task.Wait();

    if (timedOut)
    {
        return null;
    }

    // The worker may legitimately return null (no URL reported); do not dereference it blindly.
    object result = task.Result;
    string finalUrl = result == null ? null : result.ToString();
    if (string.IsNullOrEmpty(finalUrl))
    {
        throw new InvalidOperationException("Parsing Failed");
    }

    return new Uri(finalUrl);
}
// by Noseratio - http://stackoverflow.com/users/1768303/noseratio
/// <summary>
/// Navigates a new <see cref="WebBrowser"/> to the first entry of <paramref name="args"/>,
/// waits for its <c>DocumentCompleted</c> event and returns the URL the browser ended up on.
/// </summary>
/// <param name="args">
/// Navigation targets; only the first element is used, converted with <c>ToString()</c>.
/// </param>
/// <returns>
/// The browser's <c>Url</c> after the document completed, or <c>null</c> when
/// <paramref name="args"/> is empty.
/// </returns>
static async Task<object> DoWorkAsync(object[] args)
{
    // _threadCount is a counter declared outside this method.
    _threadCount++;
    Console.WriteLine("Thread count:" + _threadCount);

    // The using block guarantees the browser is disposed on every exit path,
    // including a failed navigation and an empty argument list.
    using (var wb = new WebBrowser())
    {
        wb.ScriptErrorsSuppressed = true;

        if (args.Length == 0)
        {
            return null;
        }

        var tcs = new TaskCompletionSource<bool>();
        WebBrowserDocumentCompletedEventHandler documentCompletedHandler =
            (s, e) => tcs.TrySetResult(true);

        wb.DocumentCompleted += documentCompletedHandler;
        try
        {
            wb.Navigate(args[0].ToString());
            await tcs.Task;
        }
        finally
        {
            wb.DocumentCompleted -= documentCompletedHandler;
        }

        // Read the URL before the using block disposes the browser.
        return wb.Url;
    }
}
/// <summary>
/// Runs an asynchronous worker on a freshly started STA thread that pumps its own
/// WinForms message loop via <c>Application.Run()</c>.
/// </summary>
public static class MessageLoopWorker
{
    #region Public static methods
    /// <summary>
    /// Starts an STA thread, invokes <paramref name="worker"/> on it from inside the
    /// message loop, and completes with the worker's result or exception.
    /// </summary>
    /// <param name="worker">Asynchronous delegate to execute on the STA thread.</param>
    /// <param name="args">Arguments handed to <paramref name="worker"/> unchanged.</param>
    /// <returns>The value produced by <paramref name="worker"/>.</returns>
    public static async Task<object> Run(Func<object[], Task<object>> worker, params object[] args)
    {
        var completion = new TaskCompletionSource<object>();

        ThreadStart pumpMessages = () =>
        {
            EventHandler onFirstIdle = null;
            onFirstIdle = async (sender, eventArgs) =>
            {
                // one-shot: detach as soon as the first Idle notification arrives
                Application.Idle -= onFirstIdle;

                // hand control back to the message loop before doing the real work
                await Task.Yield();

                // forward either the worker's result or its failure to the caller
                try
                {
                    completion.SetResult(await worker(args));
                }
                catch (Exception error)
                {
                    completion.SetException(error);
                }

                // ask the message loop to stop; Application.Run returns after this
                Application.ExitThread();
            };

            // Subscribing to Idle (rather than calling the worker directly) ensures the
            // worker starts inside the running message loop, with the
            // SynchronizationContext already installed.
            Application.Idle += onFirstIdle;
            Application.Run();
        };

        var staThread = new Thread(pumpMessages);
        // the new thread must be single-threaded-apartment
        staThread.SetApartmentState(ApartmentState.STA);
        staThread.Start();

        object outcome;
        try
        {
            outcome = await completion.Task;
        }
        finally
        {
            // wait for the STA thread to wind down, whether the worker succeeded or failed
            staThread.Join();
        }

        return outcome;
    }
    #endregion
}
Updated: the latest version of the WebBrowser-based console web scraper can be found on Github.
Updated: Adding a pool of WebBrowser objects for multiple parallel downloads.
Below is an implementation of a more or less generic WebBrowser-based web scraper, which works as a console application. It's a consolidation of some of my previous WebBrowser-related efforts, including the code referenced in the question:
A few points:
- The reusable MessageLoopApartment class is used to start and run a WinForms STA thread with its own message pump. It can be used from a console application, as below. This class exposes a TPL Task Scheduler (FromCurrentSynchronizationContext) and a set of Task.Factory.StartNew wrappers to use this task scheduler.
- This makes async/await a great tool for running WebBrowser navigation tasks on that separate STA thread. This way, a WebBrowser object gets created, navigated and destroyed on that thread. Although, MessageLoopApartment is not tied to WebBrowser specifically.
- It's important to enable HTML5 rendering using Browser FeatureControl, as otherwise the WebBrowser objects run in IE7 emulation mode by default. That's what SetFeatureBrowserEmulation does below.
- It may not always be possible to determine when a web page has finished rendering with 100% probability. Some pages are quite complex and use continuous AJAX updates. Yet we can get quite close, by handling the DocumentCompleted event first, then polling the page's current HTML snapshot for changes and checking the WebBrowser.IsBusy property. That's what NavigateAsync does below.
- A time-out logic is present on top of the above, in case the page rendering is never-ending (note CancellationTokenSource and CreateLinkedTokenSource).
using Microsoft.Win32;
using System;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace Console_22239357
{
/// <summary>
/// Console entry point: downloads the rendered HTML of a fixed list of sites using a
/// WebBrowser that lives inside a MessageLoopApartment.
/// </summary>
class Program
{
    // by Noseratio - http://stackoverflow.com/a/22262976/1768303

    /// <summary>
    /// Navigates to each URL in turn and writes the resulting HTML to the console.
    /// </summary>
    /// <param name="urls">Addresses to load, in order.</param>
    /// <param name="token">Cancels the whole run; each navigation additionally times out after 30 seconds.</param>
    static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
    {
        using (var apartment = new MessageLoopApartment())
        {
            // create WebBrowser inside MessageLoopApartment
            var webBrowser = apartment.Invoke(() => new WebBrowser());
            try
            {
                foreach (var url in urls)
                {
                    Console.WriteLine("URL:\n" + url);

                    // Cancel in 30s or when the main token is signalled. The linked source
                    // is disposed after every navigation so its timer and its registration
                    // on the parent token do not accumulate across iterations.
                    using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token))
                    {
                        navigationCts.CancelAfter(TimeSpan.FromSeconds(30));
                        var navigationToken = navigationCts.Token;

                        // run the navigation task inside MessageLoopApartment
                        string html = await apartment.Run(() =>
                            webBrowser.NavigateAsync(url, navigationToken), navigationToken);

                        Console.WriteLine("HTML:\n" + html);
                    }
                }
            }
            finally
            {
                // dispose of WebBrowser inside MessageLoopApartment
                apartment.Invoke(() => webBrowser.Dispose());
            }
        }
    }

    /// <summary>
    /// Entry point: enables HTML5 emulation, scrapes the sample sites with an overall
    /// three-minute limit, and exits with code -1 after printing the innermost error.
    /// </summary>
    static void Main(string[] args)
    {
        try
        {
            WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

            // overall time limit for the run; disposed once the run is over
            using (var cts = new CancellationTokenSource(TimeSpan.FromMinutes(3)))
            {
                var task = ScrapSitesAsync(
                    new[] { "http://example.com", "http://example.org", "http://example.net" },
                    cts.Token);
                task.Wait();
            }

            Console.WriteLine("Press Enter to exit...");
            Console.ReadLine();
        }
        catch (Exception ex)
        {
            // unwrap nested AggregateExceptions to report the underlying failure
            while (ex is AggregateException && ex.InnerException != null)
                ex = ex.InnerException;
            Console.WriteLine(ex.Message);
            Environment.Exit(-1);
        }
    }
}
/// <summary>
/// WebBrowserExt - extension and helper methods for the WinForms WebBrowser control
/// by Noseratio - http://stackoverflow.com/a/22262976/1768303
/// </summary>
public static class WebBrowserExt
{
    // delay, in milliseconds, between two consecutive HTML snapshots
    const int PollIntervalMs = 500;

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits for DocumentCompleted, then keeps
    /// polling the root element's HTML until the browser is idle and two consecutive
    /// snapshots are equal.
    /// </summary>
    /// <param name="webBrowser">Browser to navigate.</param>
    /// <param name="url">Address to load.</param>
    /// <param name="token">Cancels both the wait for DocumentCompleted and the polling.</param>
    /// <returns>The outer HTML of the page's root element.</returns>
    public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
    {
        // phase 1: start the navigation and wait until the document reports completion
        var documentCompleted = new TaskCompletionSource<bool>();
        WebBrowserDocumentCompletedEventHandler onCompleted =
            (sender, completedArgs) => documentCompleted.TrySetResult(true);

        using (token.Register(() => documentCompleted.TrySetCanceled(), useSynchronizationContext: true))
        {
            webBrowser.DocumentCompleted += onCompleted;
            try
            {
                webBrowser.Navigate(url);
                await documentCompleted.Task;
            }
            finally
            {
                webBrowser.DocumentCompleted -= onCompleted;
            }
        }

        // phase 2: watch the root <html> element until its markup stops changing
        var rootElement = webBrowser.Document.GetElementsByTagName("html")[0];
        var snapshot = rootElement.OuterHtml;
        for (;;)
        {
            // asynchronous pause; throws when cancellation has been requested
            await Task.Delay(PollIntervalMs, token);

            // a busy browser is still loading, so only compare snapshots when it is idle
            if (!webBrowser.IsBusy)
            {
                var latest = rootElement.OuterHtml;
                if (latest == snapshot)
                {
                    // nothing changed since the previous poll: stop polling
                    break;
                }

                snapshot = latest;
            }
        }

        // the page is treated as fully rendered at this point
        token.ThrowIfCancellationRequested();
        return snapshot;
    }

    /// <summary>
    /// Registers the current executable under FEATURE_BROWSER_EMULATION with the value
    /// 10000 (HTML5 rendering, assuming IE10+ is installed). Does nothing unless the
    /// license usage mode is Runtime.
    /// more info: http://stackoverflow.com/a/18333982/1768303
    /// </summary>
    public static void SetFeatureBrowserEmulation()
    {
        var isRuntime =
            System.ComponentModel.LicenseManager.UsageMode == System.ComponentModel.LicenseUsageMode.Runtime;
        if (isRuntime)
        {
            var modulePath = System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName;
            var appName = System.IO.Path.GetFileName(modulePath);
            Registry.SetValue(
                @"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION",
                appName,
                10000,
                RegistryValueKind.DWord);
        }
    }
}
/// <summary>
/// MessageLoopApartment
/// A background STA thread running a WinForms message loop, plus a task scheduler
/// bound to that thread so that work items can be queued onto it.
/// by Noseratio - http://stackoverflow.com/a/22262976/1768303
/// </summary>
public class MessageLoopApartment : IDisposable
{
    Thread _thread; // the STA thread that owns the message loop
    TaskScheduler _taskScheduler; // scheduler that targets the STA thread; null once disposed

    /// <summary>The scheduler that queues work onto the STA thread.</summary>
    public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

    /// <summary>
    /// Starts the STA thread and blocks until its task scheduler has been captured.
    /// </summary>
    public MessageLoopApartment()
    {
        var schedulerReady = new TaskCompletionSource<TaskScheduler>();

        ParameterizedThreadStart pumpMessages = unused =>
        {
            EventHandler onFirstIdle = null;
            onFirstIdle = (sender, eventArgs) =>
            {
                // one-shot: detach on the first Idle notification
                Application.Idle -= onFirstIdle;
                // publish the scheduler tied to this thread's synchronization context
                schedulerReady.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
            };

            // Capturing the scheduler from an Idle handler ensures the message loop is
            // running and the SynchronizationContext has been installed.
            Application.Idle += onFirstIdle;
            Application.Run();
        };

        _thread = new Thread(pumpMessages);
        _thread.SetApartmentState(ApartmentState.STA);
        _thread.IsBackground = true;
        _thread.Start();

        _taskScheduler = schedulerReady.Task.Result;
    }

    /// <summary>Stops the message loop and waits for the STA thread to finish.</summary>
    public void Dispose()
    {
        if (_taskScheduler == null)
        {
            return;
        }

        var scheduler = _taskScheduler;
        _taskScheduler = null;

        // Application.ExitThread() has to run on the STA thread itself
        var exitRequest = Task.Factory.StartNew(
            () => Application.ExitThread(),
            CancellationToken.None,
            TaskCreationOptions.None,
            scheduler);
        exitRequest.Wait();

        _thread.Join();
        _thread = null;
    }

    /// <summary>Runs <paramref name="action"/> on the STA thread and blocks until it completes.</summary>
    public void Invoke(Action action)
    {
        var pending = Task.Factory.StartNew(
            action, CancellationToken.None, TaskCreationOptions.None, _taskScheduler);
        pending.Wait();
    }

    /// <summary>Runs <paramref name="action"/> on the STA thread and blocks for its result.</summary>
    public TResult Invoke<TResult>(Func<TResult> action)
    {
        var pending = Task.Factory.StartNew(
            action, CancellationToken.None, TaskCreationOptions.None, _taskScheduler);
        return pending.Result;
    }

    /// <summary>Queues a synchronous action onto the STA thread.</summary>
    public Task Run(Action action, CancellationToken token)
    {
        var queued = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        return queued;
    }

    /// <summary>Queues a synchronous function onto the STA thread.</summary>
    public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token)
    {
        var queued = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        return queued;
    }

    /// <summary>Queues an asynchronous action onto the STA thread and returns the unwrapped task.</summary>
    public Task Run(Func<Task> action, CancellationToken token)
    {
        Task<Task> outer = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        return outer.Unwrap();
    }

    /// <summary>Queues an asynchronous function onto the STA thread and returns the unwrapped task.</summary>
    public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token)
    {
        Task<Task<TResult>> outer = Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        return outer.Unwrap();
    }
}
}
这篇关于如何在超时后取消任务等待的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持！