SkyDrive: Why art thou not nicer to me? or "How to crawl pages and download all images using C#"

/********************************************************************************************************************************************************************************************************************************************************************************************************************
 * 
 * Example Application for crawling web pages and downloading images.
 * 
 * This code works if you pass in a SkyDrive Folder Url (http://.... /browse.aspx/...) 
 * and will download any jpg images it finds in there.
 * 
 * Permission to use, copy, modify, distribute and sell this software and its 
 * documentation for any purpose is hereby granted without fee.
 * I make no representations about the suitability of this software for any purpose.
 * It is provided "as is" without express or implied warranty.
 * 
 * Alex Duggleby - 24.05.08 - V0.9 - http://alexduggleby.com
 ********************************************************************************************************************************************************************************************************************************************************************************************************************/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;

namespace Tools.SkyDrive.DownloadAll
{
    /// <summary>
    /// Takes the url to a SkyDrive folder page, crawls every image preview
    /// page linked from it and downloads all jpg images found there.
    /// </summary>
    class Program
    {
        // Number of async downloads started / completed. _wcInnerCompleted is
        // incremented from WebClient callback threads, so both counters are
        // updated via Interlocked to avoid lost updates.
        private static int _wcInnerCount = 0;
        private static int _wcInnerCompleted = 0;

        // The folder page we start crawling from (first command-line argument).
        private static Uri _uriStart;

        // Work we have already done, to avoid crawling/downloading twice.
        // (Declared generic: the original non-generic "List" does not compile.)
        private readonly static List<string> _urisCrawled = new List<string>();
        private readonly static List<string> _imagesDownloaded = new List<string>();

        // Images are written to <MyDocuments>\Downloads\Images.
        private readonly static DirectoryInfo _diDownloadTo = new DirectoryInfo(Path.Combine(Path.Combine(System.Environment.GetFolderPath(Environment.SpecialFolder.Personal), "Downloads"),"Images"));

        // This finds href="..." urls in the page.
        private readonly static Regex _regexUrl = new Regex("href\\s*=\\s*(?:(?:\\\"(?<url>[^\\\"]*)\\\")|(?<url>[^\\s]* ))");

        // This finds the link with title="Open" on an image preview page,
        // which points at the full-size image.
        private readonly static Regex _regexUrlOpen = new Regex("href\\s*=\\s*(?:(?:\\\"(?<url>[^\\\"]*)\\\")|(?<url>[^\\s]*)) title=\\\"Open\\\"");

        /// <summary>
        /// Entry point. Takes the url to a skydrive folder page as the single
        /// argument and downloads all jpg images linked from it.
        /// </summary>
        static void Main(string[] args)
        {
            // Usage check
            if (args.Length != 1)
            {
                Console.WriteLine("Usage: App.exe http://theUrlToThe/SkyDrive/FolderPage");
                return;
            }

            try
            {
                // First parameter is url
                _uriStart = new Uri(args[0]);
            }
            catch (UriFormatException _ex)
            {
                Console.WriteLine("Invalid Url. " + _ex.Message);
                return;
            }

            // Make sure download directory exists
            if (!_diDownloadTo.Exists)
                _diDownloadTo.Create();

            using (WebClient _wc = new WebClient())
            {
                // This is the index with all the images
                string _pageContents = _wc.DownloadString(_uriStart);

                // Each image has a preview page, so we get the url to that,
                // before we get the url to the actual image
                foreach (Match _matchUrlToImagePage in _regexUrl.Matches(_pageContents))
                {
                    // HtmlDecode because hrefs may contain encoded characters;
                    // relative urls are resolved against the start page.
                    Uri _uriToImagePage = new Uri(_uriStart, HttpUtility.HtmlDecode(_matchUrlToImagePage.Groups["url"].Value));

                    CrawlPreviewPage(_uriToImagePage);
                }
            }

            // Wait for the async web clients to complete... the user presses
            // enter to re-check. Crude, but it keeps the utility simple.
            while (_wcInnerCompleted < _wcInnerCount)
            {
                Console.WriteLine("Wait for images to complete...");
                Console.ReadLine();
            }

            Console.WriteLine("Should be finished!");
            Console.ReadLine();
        }

        /// <summary>
        /// Parses an image preview page and finds the actual image link
        /// (the anchor carrying title="Open"), then queues it for download.
        /// </summary>
        /// <param name="uriToImagePage">The url to the preview page.</param>
        private static void CrawlPreviewPage(Uri uriToImagePage)
        {
            // Skip pages we have already crawled.
            if (_urisCrawled.Contains(uriToImagePage.ToString()))
                return;

            _urisCrawled.Add(uriToImagePage.ToString());

            // SkyDrive preview page urls end in the image name, e.g. ".jpg".
            if (!uriToImagePage.ToString().EndsWith(".jpg", StringComparison.OrdinalIgnoreCase))
                return;

            using (WebClient _wc = new WebClient())
            {
                string _pageContents = _wc.DownloadString(uriToImagePage);

                // Find the image we want to download... There should be
                // only one link with title="Open" in it.
                foreach (Match _matchImage in _regexUrlOpen.Matches(_pageContents))
                {
                    Uri _uriToImage = new Uri(_matchImage.Groups["url"].Value);

                    DownloadImage(_uriToImage);
                }
            }
        }

        /// <summary>
        /// Starts an asynchronous download of an image into the download
        /// directory, skipping duplicates and non-image urls.
        /// </summary>
        /// <param name="uriToImage">The uri to download.</param>
        private static void DownloadImage(Uri uriToImage)
        {
            // Output the url
            Console.WriteLine("{0}{1}", uriToImage.ToString(), Environment.NewLine);

            if (_imagesDownloaded.Contains(uriToImage.ToString()))
                return;

            _imagesDownloaded.Add(uriToImage.ToString());

            // Invariant lower-casing avoids culture surprises (Turkish 'I').
            string _lowerUrl = uriToImage.ToString().ToLowerInvariant();

            // Simple checking: only jpgs, and not navigation/self links.
            if (!_lowerUrl.EndsWith(".jpg") || _lowerUrl.Contains("browse") || _lowerUrl.Contains("self"))
                return;

            // HtmlDecode here because some urls have encoded characters
            string _localFilename = HttpUtility.HtmlDecode( uriToImage.Segments[uriToImage.Segments.Length - 1]);

            // Create a valid local filename. GetInvalidFileNameChars (not
            // GetInvalidPathChars) covers characters like ':' and '?' that
            // are legal in a path but not in a file name.
            Path.GetInvalidFileNameChars().ToList().ForEach(
                c => _localFilename = _localFilename.Replace(c, '_'));

            Console.Write("Downloading {0}...{1}", _localFilename, Environment.NewLine);

            // Create a separate web client for each image (uses async, and you
            // can't issue two downloads at the same time for the same client).
            // Do NOT wrap it in a using block: DownloadFileAsync returns
            // immediately and disposing the client here would cancel the
            // transfer. It is disposed in the completion handler instead.
            WebClient _wcInner = new WebClient();

            // Subscribe BEFORE starting the download so a fast completion
            // cannot be missed.
            _wcInner.DownloadFileCompleted += new AsyncCompletedEventHandler(_wcInner_DownloadFileCompleted);

            Interlocked.Increment(ref _wcInnerCount);
            _wcInner.DownloadFileAsync(uriToImage, Path.Combine(_diDownloadTo.FullName, _localFilename));
        }

        /// <summary>
        /// Fired when a download completes: disposes the per-image client,
        /// reports errors and progress, and announces when all are done.
        /// </summary>
        private static void _wcInner_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
        {
            // Each image got its own WebClient; dispose it now that it's done.
            WebClient _client = sender as WebClient;
            if (_client != null)
                _client.Dispose();

            // Don't silently swallow failed downloads.
            if (e.Error != null)
                Console.WriteLine("Download failed: " + e.Error.Message);

            // Increase the completed counter (callback may run on a pool thread).
            int _completed = Interlocked.Increment(ref _wcInnerCompleted);

            // Ok, we could do some more extensive checking, this could trigger
            // even if there are still items to download... but hey, it's just a
            // quick utility!
            if (_completed == _wcInnerCount)
            {
                Console.WriteLine("{0}{1}{2}", Environment.NewLine, "Finished all files!", Environment.NewLine);
                Console.ReadLine();
            }
            else
            {
                Console.WriteLine("File {0} of {1} completed!", _completed, _wcInnerCount);
            }
        }
    }
}