/********************************************************************************************************************************************************************************************************************************************************************************************************************
*
* Example Application for crawling web pages and downloading images.
*
* This code works if you pass in a SkyDrive Folder Url (http://.... /browse.aspx/...)
* and will download any jpg images it finds in there.
*
* Permission to use, copy, modify, distribute and sell this software and its
* documentation for any purpose is hereby granted without fee.
* I make no representations about the suitability of this software for any purpose.
* It is provided "as is" without express or implied warranty.
*
* Alex Duggleby - 24.05.08 - V0.9 - http://alexduggleby.com
********************************************************************************************************************************************************************************************************************************************************************************************************************/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
namespace Tools.SkyDrive.DownloadAll
{
class Program
{
    // Counters tracking how many async downloads were started / have finished.
    // They are touched from the main thread AND from WebClient completion
    // callbacks, so all updates go through Interlocked.
    private static int _wcInnerCount = 0;
    private static int _wcInnerCompleted = 0;

    // The SkyDrive folder page we start crawling from (first command-line argument).
    private static Uri _uriStart;

    // Work we have already done -- prevents crawling the same preview page or
    // queueing the same image twice. HashSet gives O(1) membership checks.
    private readonly static HashSet<string> _urisCrawled = new HashSet<string>();
    private readonly static HashSet<string> _imagesDownloaded = new HashSet<string>();

    // Images are saved to <MyDocuments>\Downloads\Images.
    private readonly static DirectoryInfo _diDownloadTo = new DirectoryInfo(
        Path.Combine(
            Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Personal), "Downloads"),
            "Images"));

    // Matches href="..." (or an unquoted href=... up to whitespace) in the folder page.
    private readonly static Regex _regexUrl = new Regex("href\\s*=\\s*(?:(?:\\\"(?<url>[^\\\"]*)\\\")|(?<url>[^\\s]* ))");

    // Matches the single link carrying title="Open" on an image preview page.
    private readonly static Regex _regexUrlOpen = new Regex("href\\s*=\\s*(?:(?:\\\"(?<url>[^\\\"]*)\\\")|(?<url>[^\\s]*)) title=\\\"Open\\\"");

    /// <summary>
    /// Takes the url to a SkyDrive folder page and downloads all jpg images it links to.
    /// </summary>
    /// <param name="args">args[0] must be the url of the SkyDrive folder page.</param>
    static void Main(string[] args)
    {
        // Usage check
        if (args.Length != 1)
        {
            Console.WriteLine("Usage: App.exe http://theUrlToThe/SkyDrive/FolderPage");
            return;
        }

        try
        {
            // First parameter is the start url
            _uriStart = new Uri(args[0]);
        }
        catch (UriFormatException _ex)
        {
            Console.WriteLine("Invalid Url. " + _ex.Message);
            return;
        }

        // Make sure the download directory exists
        if (!_diDownloadTo.Exists)
            _diDownloadTo.Create();

        using (WebClient _wc = new WebClient())
        {
            // This is the index page that links to every image's preview page.
            string _pageContents = _wc.DownloadString(_uriStart);
            // Each image has a preview page; crawl it to find the real image url.
            foreach (Match _matchUrlToImagePage in _regexUrl.Matches(_pageContents))
            {
                Uri _uriToImagePage = new Uri(_uriStart, HttpUtility.HtmlDecode(_matchUrlToImagePage.Groups["url"].Value));
                CrawlPreviewPage(_uriToImagePage);
            }
        }

        // Wait for the async downloads to complete; the user presses Enter to re-check.
        while (_wcInnerCompleted < _wcInnerCount)
        {
            Console.WriteLine("Wait for images to complete...");
            Console.ReadLine();
        }
        Console.WriteLine("Should be finished!");
        Console.ReadLine();
    }

    /// <summary>
    /// Parses an image preview page and starts a download for the image it links to.
    /// </summary>
    /// <param name="uriToImagePage">The url of the preview page.</param>
    private static void CrawlPreviewPage(Uri uriToImagePage)
    {
        string _key = uriToImagePage.ToString();

        // Add returns false when the url was already crawled -- skip duplicates.
        if (!_urisCrawled.Add(_key))
            return;

        // Only preview pages whose url ends in ".jpg" are of interest.
        if (!_key.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase))
            return;

        // Create the client only when we actually fetch something
        // (the original created one even for urls it then ignored).
        using (WebClient _wc = new WebClient())
        {
            string _pageContents = _wc.DownloadString(uriToImagePage);
            // Find the image we want to download... There should be exactly
            // one link with title="Open" in it, pointing at the full-size image.
            foreach (Match _matchImage in _regexUrlOpen.Matches(_pageContents))
            {
                Uri _uriToImage = new Uri(_matchImage.Groups["url"].Value);
                DownloadImage(_uriToImage);
            }
        }
    }

    /// <summary>
    /// Starts an asynchronous download of a single image into the download directory.
    /// </summary>
    /// <param name="uriToImage">The uri of the image to download.</param>
    private static void DownloadImage(Uri uriToImage)
    {
        // Output the url
        Console.WriteLine("{0}{1}", uriToImage.ToString(), Environment.NewLine);

        // Add returns false when this image was already queued -- skip duplicates.
        if (!_imagesDownloaded.Add(uriToImage.ToString()))
            return;

        string _lowerUrl = uriToImage.ToString().ToLower();
        // Simple filtering: real image urls end in .jpg and are neither
        // folder ("browse") nor self-referencing links.
        if (!_lowerUrl.EndsWith(".jpg") || _lowerUrl.Contains("browse") || _lowerUrl.Contains("self"))
            return;

        // HtmlDecode here because some urls have encoded characters.
        string _localFilename = HttpUtility.HtmlDecode(uriToImage.Segments[uriToImage.Segments.Length - 1]);

        // Sanitize the filename. GetInvalidFileNameChars (not GetInvalidPathChars)
        // is the correct set here: the path-char set misses characters such as
        // ':' and '?' that are legal in urls but not in filenames.
        foreach (char _c in Path.GetInvalidFileNameChars())
            _localFilename = _localFilename.Replace(_c, '_');

        Console.Write("Downloading {0}...{1}", _localFilename, Environment.NewLine);

        // Create a separate web client for each image (uses async, and a single
        // client cannot run two downloads at the same time). The client must NOT
        // be disposed here: disposing while the async transfer is in flight would
        // abort it. The completion handler disposes it instead.
        WebClient _wcInner = new WebClient();
        Interlocked.Increment(ref _wcInnerCount);
        // Attach the handler BEFORE starting the download, otherwise a very fast
        // completion could fire before the handler is registered.
        _wcInner.DownloadFileCompleted += new AsyncCompletedEventHandler(_wcInner_DownloadFileCompleted);
        _wcInner.DownloadFileAsync(uriToImage, Path.Combine(_diDownloadTo.ToString(), _localFilename));
    }

    /// <summary>
    /// Fired when one async download completes: disposes the client, reports
    /// progress and announces when all queued files are done.
    /// </summary>
    private static void _wcInner_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
    {
        // The client was deliberately kept alive for the duration of the
        // download (see DownloadImage); release it now.
        WebClient _wc = sender as WebClient;
        if (_wc != null)
            _wc.Dispose();

        // Surface failures instead of silently counting them as done.
        if (e.Error != null)
            Console.WriteLine("Download failed: " + e.Error.Message);

        // Increase the completed counter (callbacks may run concurrently).
        int _completed = Interlocked.Increment(ref _wcInnerCompleted);

        // Ok, we could do some more extensive checking, this could trigger
        // even if there are still items to download... but hey, it's just a
        // quick utility!
        if (_completed == _wcInnerCount)
        {
            Console.WriteLine("{0}{1}{2}", Environment.NewLine, "Finished all files!", Environment.NewLine);
            Console.ReadLine();
        }
        else
        {
            Console.WriteLine("File {0} of {1} completed!", _completed, _wcInnerCount);
        }
    }
}
}