#!/usr/bin/python
'''A script that converts a CHM compiled HTML file into a single PDF file.

(c) 2012 Neil Schemenauer <nas@arctrix.com>

Loosely based on a version of a script with the same name:
    (c) 2007 Massimo Sandal
    (c) 2007 Chris Karakas <http://www.karakas-online.de>
'''
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import os
import optparse
import subprocess
import shutil
import sgmllib
import tempfile
import urllib
import chm.chm as chm

USAGE = 'Usage: %prog [options] <filename>'
VERSION = '1.0'

# Module-wide verbosity flag.  main() overwrites it from the -v/--verbose
# option; this default keeps the helper functions usable (and quiet) when
# the module is imported and they are called without going through main().
VERBOSE = False


class PageLister(sgmllib.SGMLParser):
    """Collect the page locations named by ``<param>`` tags.

    After markup has been fed to the parser, ``pages`` holds the ``value``
    attribute of every ``<param>`` tag whose ``name`` attribute is exactly
    ``Local``, in document order (duplicates are kept).
    """

    def reset(self):
        """Reset the parser and start with an empty ``pages`` list."""
        sgmllib.SGMLParser.reset(self)
        self.pages = []

    def start_param(self, attrs):
        """Record the ``value`` of a ``<param name="Local">`` tag.

        ``attrs`` is the list of ``(name, value)`` pairs handed over by
        SGMLParser.  A matching tag that has no ``value`` attribute is
        skipped rather than raising KeyError and aborting the parse.
        """
        attrs = dict(attrs)
        if attrs.get('name') != 'Local':
            return
        value = attrs.get('value')
        if value is not None:
            self.pages.append(value)


def list_topics(cfile):
    """Return the page URLs found in the topics tree of ``cfile``.

    ``cfile`` must provide a ``GetTopicsTree()`` method.  When that yields
    nothing (None or an empty string) the result is an empty list;
    otherwise the tree is fed to a PageLister and the list of locations it
    collected is returned.
    """
    tree = cfile.GetTopicsTree()
    if tree:
        parser = PageLister()
        parser.feed(tree)
        return parser.pages
    return []


class UrlCatcher(sgmllib.SGMLParser):
    """Gather the resource URLs referenced by a document.

    After markup has been fed to the parser, ``urls`` is a set holding the
    ``src`` value of every ``<img>`` tag and the ``href`` value of every
    ``<link>`` tag that was seen.
    """

    def reset(self):
        """Reset the parser and start with an empty ``urls`` set."""
        sgmllib.SGMLParser.reset(self)
        self.urls = set()

    def _remember(self, attrs, wanted):
        # Add the value of every attribute whose lower-cased name equals
        # ``wanted`` to the set of collected URLs.
        self.urls.update(value for name, value in attrs
                         if name.lower() == wanted)

    def start_img(self, attrs):
        """Collect the ``src`` attribute of an ``<img>`` tag."""
        self._remember(attrs, 'src')

    def start_link(self, attrs):
        """Collect the ``href`` attribute of a ``<link>`` tag."""
        self._remember(attrs, 'href')


def list_urls(page_data):
    """Collect the URLs that an HTML document refers to.

    ``page_data`` is the document source.  It is fed to a UrlCatcher and
    the set of URLs that parser gathered is returned.
    """
    catcher = UrlCatcher()
    catcher.feed(page_data)
    found = catcher.urls
    return found


def fix_abs_urls(base_dir, filename):
    """Point root-relative URLs of an extracted page into ``base_dir``.

    The page stored in ``filename`` is read and every URL reported by
    list_urls() that starts with a single ``/`` is prefixed with the
    URL-quoted ``base_dir``.  The file is rewritten in place, and only when
    at least one reference was actually replaced.

    base_dir: directory the CHM content was extracted into.
    filename: path of the HTML page to fix.

    Left untouched are:
      * protocol-relative URLs (``//host/path``), which name another host
        rather than a path below ``base_dir``;
      * URLs that already start with the quoted ``base_dir``, so calling
        this function twice on the same file does not prefix them twice.
    """
    with open(filename, 'rU') as fp:
        page = fp.read()
    prefix = urllib.quote(base_dir)
    change = {}
    for url in list_urls(page):
        if not url.startswith('/') or url.startswith('//'):
            continue
        if url.startswith(prefix + '/'):
            # already rewritten by an earlier call
            continue
        # The parser accepts both quoting styles for attribute values, so
        # look for the URL in either of them.
        for quote in ('"', "'"):
            old = quote + url + quote
            if old in page:
                change[old] = quote + prefix + url + quote
    if not change:
        return
    for old, new in sorted(change.items()):
        if VERBOSE:
            print('%s -> %s' % (old, new))
        # This is a hack: a plain text replacement might also change
        # content that is not actually a reference.
        page = page.replace(old, new)
    with open(filename, 'w') as fp:
        fp.write(page)


def convert_pdf_wk(options, pages, outputfilename):
    """Convert HTML pages into a single PDF with wkhtmltopdf.

    options: parsed command-line options; ``page_size``, ``grayscale`` and
        ``extract_only`` are read.
    pages: sequence of HTML file names, in the order they should appear.
    outputfilename: path of the PDF to create.

    The command line is printed when the module is in verbose mode.
    Returns the exit status of wkhtmltopdf, or None when ``extract_only``
    is set and the command is therefore not run.
    """
    pdf_opts = ['--page-size', options.page_size]
    if options.grayscale:
        pdf_opts.append('-g')
    if not VERBOSE:
        pdf_opts.append('--quiet')
    # list() lets callers pass any sequence of pages, not only a list.
    cmd = ['wkhtmltopdf'] + pdf_opts + list(pages) + [outputfilename]
    if VERBOSE:
        print(' '.join(cmd))
    if options.extract_only:
        return None
    return subprocess.call(cmd)


def convert_pdf_htmldoc(options, pages, outputfilename):
    """Convert HTML pages into a single PDF with htmldoc.

    options: parsed command-line options; ``page_size``, ``grayscale`` and
        ``extract_only`` are read.
    pages: sequence of HTML file names, in the order they should appear.
    outputfilename: path of the PDF to create.

    The command line is printed when the module is in verbose mode.
    Returns the exit status of htmldoc, or None when ``extract_only`` is
    set and the command is therefore not run.
    """
    cmd = ['htmldoc', '--continuous', '--duplex', '--format', 'pdf14',
           '--outfile', outputfilename, '--jpeg=90',
           '--size', options.page_size.lower()]
    # Further htmldoc options that were left disabled here:
    #   '--header', 'c C', '--footer', 'c C',
    #   '--bodyfont', 'times', '--linkstyle', 'plain'
    cmd.append('--gray' if options.grayscale else '--color')
    if not VERBOSE:
        cmd.append('--quiet')
    cmd.extend(pages)
    if VERBOSE:
        print(' '.join(cmd))
    if options.extract_only:
        return None
    return subprocess.call(cmd)


def _collect_pages(urls, base_dir):
    """Turn topic URLs into the list of extracted page files to convert.

    Must be called with the current directory set to ``base_dir``, because
    the returned names are relative paths that are opened as such.

    For every URL the ``#fragment`` is dropped, the rest is unquoted and a
    leading ``/`` is removed.  A page is kept only once, and only when it is
    a regular file located below ``base_dir``; kept pages are passed through
    fix_abs_urls().  Everything else is ignored (reported in verbose mode).
    """
    root = os.path.realpath(base_dir)
    pages = []
    seen = set()
    for url in urls:
        # Cut the fragment off before unquoting so that an escaped "%23"
        # inside a file name survives.
        path = urllib.unquote(url.split('#', 1)[0])
        if path.startswith('/'):
            path = path[1:]
        target = os.path.realpath(os.path.join(root, path))
        if target in seen:
            # Same page referenced again (e.g. with another anchor):
            # converting or rewriting it twice would be wrong.
            continue
        seen.add(target)
        # Names that resolve outside the extraction directory ("../x",
        # "//etc/x") are treated like missing files: the topics tree comes
        # from the CHM and must not make us touch unrelated files.
        inside = target.startswith(root + os.sep)
        if inside and os.path.isfile(target):
            pages.append(path)
            fix_abs_urls(base_dir, path)
        elif VERBOSE:
            print('cannot find %r, ignoring' % path)
    return pages


def main(argv):
    """Command-line entry point: convert one CHM file into a PDF.

    argv: full argument vector, program name first (as in ``sys.argv``).

    The CHM content is extracted with ``extract_chmLib`` into a fresh
    temporary directory, the pages listed in the topics tree are collected
    and handed to wkhtmltopdf (or htmldoc with ``--htmldoc``).  The
    temporary directory is removed afterwards unless ``--keep`` or
    ``--extract-only`` was given.

    Raises SystemExit with status 2 on a usage error and with an error
    message when the CHM file is missing, extraction fails, no page can be
    found or the PDF converter reports a non-zero exit status.
    """
    global VERBOSE
    parser = optparse.OptionParser(USAGE, version=VERSION)
    parser.add_option('--output', '-o', default=None)
    parser.add_option('-v', '--verbose',
                      action='store_true', dest='verbose', default=False,
                      help="enable extra status output")
    parser.add_option('-k', '--keep',
                      action='store_true',
                      help="keep temporary directory of extracted content")
    parser.add_option('-e', '--extract-only',
                      action='store_true', dest='extract_only',
                      help="extract content but do not create PDF")
    parser.add_option('-s', '--page-size', default='A4',
                      help="Set paper size to : A4, Letter, etc")
    parser.add_option('-g', '--grayscale', action='store_true',
                      help="PDF will be generated in grayscale")
    parser.add_option('--htmldoc', action='store_true',
                      help="use htmldoc create PDF (default "
                           "is to use wkhtmltopdf)")
    # Parse the vector we were given instead of silently using sys.argv.
    options, args = parser.parse_args(argv[1:])
    VERBOSE = options.verbose
    if options.extract_only:
        # extracting without keeping the result would be pointless
        options.keep = True

    if len(args) != 1:
        parser.print_usage()
        raise SystemExit(2)

    chm_path = os.path.abspath(args[0])
    if not os.path.exists(chm_path):
        raise SystemExit('CHM file "' + chm_path + '" not found!')

    cfile = chm.CHMFile()
    cfile.LoadCHM(chm_path)

    # Resolve the output path now: the working directory changes below.
    if options.output:
        outputfilename = os.path.abspath(options.output)
    else:
        stem = os.path.splitext(os.path.basename(chm_path))[0]
        outputfilename = os.path.join(os.path.dirname(chm_path),
                                      stem + '.pdf')

    start_dir = os.getcwd()
    base_dir = tempfile.mkdtemp(prefix='chm2pdf.')
    try:
        os.chdir(base_dir)
        if VERBOSE:
            print('extracting to %s' % base_dir)
        extractor = subprocess.Popen(['extract_chmLib', chm_path, '.'],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT)
        # stderr is merged into stdout, so only the first item carries data
        out = extractor.communicate()[0]
        if VERBOSE:
            sys.stdout.write(out)
        if extractor.returncode:
            raise SystemExit('extract_chmLib failed with status %d'
                             % extractor.returncode)

        urls = list_topics(cfile)
        if not urls:
            raise SystemExit('No pages found.')
        pages = _collect_pages(urls, base_dir)
        if not pages:
            # nothing usable was extracted; do not run the converter
            # without any input
            raise SystemExit('No pages found.')

        if options.htmldoc:
            status = convert_pdf_htmldoc(options, pages, outputfilename)
        else:
            status = convert_pdf_wk(options, pages, outputfilename)
        if status:
            raise SystemExit('PDF converter exited with status %d' % status)
    finally:
        # Leave the temporary directory before it is (possibly) deleted.
        try:
            os.chdir(start_dir)
        except OSError:
            # the original directory vanished; still clean up below
            pass
        if options.keep:
            print('keeping extracted content as %r' % base_dir)
        else:
            shutil.rmtree(base_dir)


if __name__ == '__main__':
    # Executed as a script: pass the complete argument vector to main().
    sys.exit(main(sys.argv))