#!/usr/bin/env python
# -*- mode: python; coding: koi8-r; -*-

import libxml2
from textparagraph import TextParagraph

__DEBUG__ = True

class FB2Parser:

    def __init__(self):

        self.current_paragraph = None

        #self.content = TextContent()
        self.content = []
        self.footnotes = {}
        self.footnotes_index = 1
        self.images = {}
        self.genres = []
        self.book_title = ''
        self.authors = []
        self.lang = ''

        self.title_level = 0
        self.titles_list = []

        self.cover = []
        self.description = []

        self.body_is_footnote = False

        #self.current_footnote = None
        self.current_footnote_id = None

        #self.current_body_name = None


    def addEmptyLine(self):
        # добавляет пустую строку, если предыдущая не пустая
        if self.content and self.content[-1].data: # todo self.content[-1].type != 'image'
            # предыдущий параграф не пустой
            self.content.append(TextParagraph('default'))


    def parseFB2(self, data):

        doc = libxml2.parseMemory(data, len(data))

        current = doc.children

        # skip comment
        while current:
            if current.name == 'FictionBook':
                break
            current = current.next
        current = current.children

        while current:

            if current.name == 'stylesheet':
                self.parseStylesheet(current)

            elif current.name == 'description':
                self.parseDescription(current)

            elif current.name == 'body':
                self.parseBody(current)

            elif current.name == 'binary':
                self.parseBinary(current)

            current = current.next

        doc.freeDoc()


    def parseStylesheet(self, current):
        #print 'parseStylesheet'
        pass


    def parseDescription(self, current):
        #print 'parseDescription'
        current = current.children

        while current:

            if current.name == 'title-info':
                self.parseTitleInfo(current)
            elif current.name == 'document-info':
                self.parseDocumentInfo(current)

            current = current.next


    def parseTitleInfo(self, current):
        current = current.children

        while current:

            if current.name == 'genre':
                if current.children and current.children.content:
                    self.genres.append(current.children.content)

            elif current.name == 'lang':
                if current.children and current.children.content:
                    self.lang = current.children.content

            elif current.name == 'author':
                self.parseAuthor(current)

            elif current.name == 'book-title':
                if current.children and current.children.content:
                    self.book_title = current.children.content

            elif current.name == 'annotation':
                self.parseAnnotation(current)
                self.description = self.content
                self.content = []

            elif current.name == 'coverpage':
                # cover
                c = current.children
                while c:
                    if c.name == 'image' and c.prop('href'):
                        self.cover.append(c.prop('href')[1:])
                    c = c.next

            current = current.next


    def parseAuthor(self, current):
        current = current.children
        author = []
        while current:
            if current.name in ['first-name', 'middle-name', 'last-name']:
                author.append(unicode(current.content, 'utf-8'))
            elif current.name == 'nick-name':
                author.append('('+unicode(current.content, 'utf-8')+')')
            current = current.next
        author = ' '.join(author)
        self.authors.append(author)
        #print '>%s<' % author


    def parseDocumentInfo(self, current):
        current = current.children

        while current:

            if current.name == 'title':
                self.parseTitle(current)

            elif current.name == 'section':
                self.parseSection(current)

            current = current.next


    def parseBody(self, current):
        #print 'parseBody'

        if current.prop('name'): # in ['footnote', 'notes']:
            self.body_is_footnote = True
        else:
            self.body_is_footnote = False

##         if current.prop('name'):
##             self.current_body_name = current.prop('name')

        current = current.children

        while current:

            if current.name == 'title':
                self.parseTitle(current)
                # + empty-line

            elif current.name == 'section':
                self.parseSection(current)

            elif current.name == 'image':
                self.parseImage(current)

            elif current.name == 'epigraph':
                self.parseEpigraph(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseBody:', current.name

            current = current.next


    def parseTitle(self, current):
        #print 'parseTitle'
        title_text = []
        title_index = len(self.content)

        current = current.children

        self.addEmptyLine()

        while current:

            if current.name == 'p':

                self.current_paragraph = TextParagraph('title')
                self.content.append(self.current_paragraph)
                self.current_paragraph.title_level = self.title_level
                self.parseP(current)

                title_text.append(self.current_paragraph.data)

            elif current.name == 'empty-line':
                self.addEmptyLine()

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseTitle:', current.name


            current = current.next

        self.titles_list.append((' '.join(title_text).replace('\n', ' ').strip(),
                                 title_index))
        # + empty-line
        self.addEmptyLine()


    def parseSection(self, current):
        #print 'parseSection'

        self.title_level += 1

        footnote_name = None
        if self.body_is_footnote and current.prop('id'):
            footnote_name = current.prop('id')
            footnotes_start = len(self.content)

        current = current.children

        while current:

            if current.name == 'title':
                if footnote_name:
                    footnotes_start += 1
                self.parseTitle(current)

            elif current.name == 'subtitle':
                self.parseSubtitle(current)

            elif current.name == 'epigraph':
                self.parseEpigraph(current)

            elif current.name == 'image':
                self.parseImage(current)

            elif current.name == 'annotation':
                self.parseAnnotation(current)

            elif current.name == 'section':
                self.parseSection(current)

            elif current.name == 'p':
                self.current_paragraph = TextParagraph('paragraph')
                self.content.append(self.current_paragraph)
                self.parseP(current)

            elif current.name == 'poem':
                self.parsePoem(current)

            elif current.name == 'cite':
                self.parseCite(current)

            elif current.name == 'empty-line':
                self.addEmptyLine()

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseSection:', current.name

            current = current.next

        if footnote_name:
            #print footnote_name
            #print '>>', self.current_paragraph.data.encode('koi8-r', 'replace'), '<<'

            self.footnotes[footnote_name] = []

            if self.content[footnotes_start].type == 'title' \
               and len(self.content[footnotes_start:]) >= 3:

                par = TextParagraph('paragraph')
                title = self.content[footnotes_start].data
                if title[:2] == '* ' and title[-2:] == ' *':
                    title = title[2:-2]
                if title[-1] != '.': title += '.'
                par.data = title+' '+self.content[footnotes_start+2].data #[1:]??
                par.styles.append((0, len(title), 'strong'))
                self.footnotes[footnote_name].append(par)
                footnotes_start += 3

            for par in self.content[footnotes_start:]:
                self.footnotes[footnote_name].append(par)

        self.title_level -= 1


    def parseEpigraph(self, current):
        #print 'parseEpigraph'

        current = current.children

        while current:

            if current.name == 'p':
                self.current_paragraph = TextParagraph('epigraph')
                self.content.append(self.current_paragraph)
                self.parseP(current)

            elif current.name == 'poem':
                self.parsePoem(current)

            elif current.name == 'cite':
                self.parseCite(current)

            elif current.name == 'text-author':
                paragraph = TextParagraph('text author')
                self.content.append(paragraph)
                paragraph.data = unicode(current.children.content, 'utf-8')

            elif current.name == 'empty-line':
                self.addEmptyLine()

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseEpigraph:', current.name

            current = current.next

        self.addEmptyLine()


    def parseImage(self, current):
        #print 'parseImage'

        if current.prop('href'):
            paragraph = TextParagraph('image')
            self.content.append(paragraph)
            paragraph.href = current.prop('href')[1:]


    def parseAnnotation(self, current):

        current = current.children

        while current:

            if current.name == 'p':
                self.current_paragraph = TextParagraph('paragraph')
                self.content.append(self.current_paragraph)
                self.parseP(current)

            elif current.name == 'poem':
                self.parsePoem(current)

            elif current.name == 'cite':
                self.parseCite(current)

            elif current.name == 'empty-line':
                self.addEmptyLine()

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseAnnotation:', current.name

            current = current.next


    def parseP(self, current):
        #print 'parseP'

        footnote_name = None
        if self.body_is_footnote and current.prop('id'):
            footnote_name = current.prop('id')
            self.footnotes[footnote_name] = []
            self.current_footnote_id = footnote_name

        current = current.children

        while current:

            if current.name == 'style':
                self.parseStyle(current)

            elif current.name == 'strong':
                self.parseStrong(current)

            elif current.name == 'emphasis':
                self.parseEmphasis(current)

            elif current.name == 'a':
                self.parseLink(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseP:', current.name

            elif current.content:
                s = unicode(current.content, 'utf-8')
                self.current_paragraph.data += s

            current = current.next

        if footnote_name or self.current_footnote_id:
            #print footnote_name
            #print self.current_paragraph.data.encode('koi8-r', 'replace')
            #self.footnotes[footnote_name].append(self.current_paragraph)
            self.footnotes[self.current_footnote_id].append(self.current_paragraph)

    def parseStrong(self, current):

        current = current.children

        p = self.current_paragraph

        if current and current.content: # ???

            s = unicode(current.content, 'utf-8')
            self.current_paragraph.styles.append((len(self.current_paragraph.data), len(s), 'strong'))
            self.current_paragraph.data += s


    def parseEmphasis(self, current):

        current = current.children

        p = self.current_paragraph

        if current and current.content: # ???

            s = unicode(current.content, 'utf-8')
            self.current_paragraph.styles.append((len(self.current_paragraph.data), len(s), 'emphasis'))
            self.current_paragraph.data += s


    def parseStyle(self, current):

        ## style name="foreign lang" xml:lang="fr"
        lang = None
        if current.prop('name') == 'foreign lang':
            lang = current.prop('lang')
            offset = len(self.current_paragraph.data)

        #print '>>', current.prop('name'), current.prop('lang')

        current = current.children

        while current:
            if current.name == 'style':
                self.parseStyle(current)

            elif current.name == 'strong':
                self.parseStrong(current)

            elif current.name == 'emphasis':
                self.parseEmphasis(current)

            elif current.name == 'a':
                self.parseLink(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseStyle:', current.name

            elif current.content:
                s = unicode(current.content, 'utf-8')
                self.current_paragraph.data += s

            current = current.next

        if lang:
            self.current_paragraph.styles.append((offset, len(self.current_paragraph.data)-offset, 'lang=%s'%lang))


    def parseLink(self, current):
        #print 'parseLink'
        #print 'type->', current.prop('type')
        #print 'href->', current.prop('href')

        footnote_name = None

        if current.prop('type') == 'note' and current.prop('href'):
            footnote_name = current.prop('href')
            link_type = 'footnote'
            footnote_start = len(self.current_paragraph.data)
        elif current.prop('href'):
            footnote_name = current.prop('href')
            #footnote_start = -1
            link_type = 'hyperlink'
            footnote_start = len(self.current_paragraph.data)

        current = current.children

        while current:
            if current.name == 'style':
                self.parseStyle(current)

            elif current.name == 'strong':
                self.parseStrong(current)

            elif current.name == 'emphasis':
                self.parseEmphasis(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseLink:', current.name

            elif current.content:

                s = unicode(current.content, 'utf-8')
                if (s[0] == '[' and s[-1] == ']') \
                       or (s[0] == '{' and s[-1] == '}'):
                    s = s[1:-1]
                self.current_paragraph.data += s

            current = current.next

        if footnote_name:

            footnote_end = len(self.current_paragraph.data)
            footnote_length = footnote_end - footnote_start
            self.current_paragraph.footnotes.append((footnote_start,
                                                     footnote_end,
                                                     footnote_name[1:]))

            self.current_paragraph.styles.append((footnote_start,
                                                  footnote_length,
                                                  link_type))

##             if link_type == 'hyperlink': #footnote_start == -1:
##                 footnote_start = len(self.current_paragraph.data)
##                 self.current_paragraph.data += str(self.footnotes_index)
##                 self.footnotes_index += 1
##             else: # footnote
##                 footnote_end = len(self.current_paragraph.data)
##                 footnote_length = footnote_end - footnote_start
##                 self.current_paragraph.footnotes.append((footnote_start,
##                                                          footnote_end,
##                                                          footnote_name[1:]))
##                 self.current_paragraph.styles.append((footnote_start,
##                                                       footnote_length,
##                                                       'footnote'))


    def parsePoem(self, current):

        current = current.children

        while current:

            if current.name == "title":
                self.parseTitle(current)

            elif current.name == "epigraph":
                self.parseEpigraph(current)

            elif current.name == "p":
                self.parseP(current)

            elif current.name == "empty-line":
                self.addEmptyLine()

            elif current.name == "stanza":
                self.addEmptyLine()
                self.parseStanza(current)

            elif current.name == 'text-author':
                paragraph = TextParagraph('text author')
                self.content.append(paragraph)
                paragraph.data = unicode(current.children.content, 'utf-8')

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parsePoem:', current.name

            current = current.next

        self.addEmptyLine()


    def parseStanza(self, current):

        current = current.children

        while current:
            #print current.name

            if current.name == 'v':
                self.current_paragraph = TextParagraph('stanza')
                self.content.append(self.current_paragraph)
                self.parseV(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseStanza:', current.name

            current = current.next


    def parseV(self, current):

        current = current.children

        while current:
            #print current.name

            if current.name == 'a':
                self.parseLink(current)

            elif current.name == 'style':
                self.parseStyle(current)

            elif current.name == 'strong':
                self.parseStrong(current)

            elif current.name == 'emphasis':
                self.parseEmphasis(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseV:', current.name

            elif current.content:
                s = unicode(current.content, 'utf-8')
                self.current_paragraph.data += s

            current = current.next


    def parseSubtitle(self, current):

        self.addEmptyLine()
        self.current_paragraph = TextParagraph('subtitle')
        self.content.append(self.current_paragraph)
        self.parseP(current)
        self.addEmptyLine()

##         current = current.children
##         self.current_paragraph = TextParagraph('subtitle')
##         self.content.append(self.current_paragraph)
##         while current:
##             if current.name == 'p':
##                 self.parseP(current)
##             elif __DEBUG__ and current.name != 'text':
##                 print 'unsupported tag:', 'parseSubtitle:', current.name
##             elif current.content:
##                 s = unicode(current.content, 'utf-8')
##                 self.current_paragraph.data += s
##             current = current.next
##         self.addEmptyLine()


    def parseCite(self, current):
        current = current.children

        while current:

            if current.name == 'p':
                self.current_paragraph = TextParagraph('cite')
                self.content.append(self.current_paragraph)
                self.parseP(current)

            elif current.name == 'poem':
                self.parsePoem(current)

            elif current.name == 'empty-line':
                self.addEmptyLine()

            elif current.name == 'text-author':
                paragraph = TextParagraph('text author')
                self.content.append(paragraph)
                paragraph.data = unicode(current.children.content, 'utf-8')

            elif current.name == 'subtitle':
                self.parseSubtitle(current)

            elif __DEBUG__ and current.name != 'text':
                print 'unsupported tag:', 'parseCite:', current.name

            current = current.next


    def parseBinary(self, current):

        if current.prop('id'):
            id = current.prop('id')
            data = ''
            current = current.children

            while current:
                if current.content:
                    data += current.content
                current = current.next

            if data:
                self.images[id]=data

# test
if __name__ == '__main__':

    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: %s filename' % sys.argv[0])

    fb = FB2Parser()
    fb.parseFB2(open(sys.argv[1]).read())


    #for i in fb.content:
    #    print i.type

    #print len(fb.content)

##     i = 0
##     for p in fb.content:
##         if p.type == 'image':
##             print 'href:', p.href, i, len(fb.images[p.href])
##         i += 1

##     for c in fb.cover:
##         print 'cover:', c, len(fb.images[c])

##     for i in fb.images:
##         print '---- id ----', i
##         print '-'*70
##         print fb.images[i]
##         print '-'*70

##     for p in fb.content:
##         for f in p.footnotes:
##             print '-'*72
##             print fb.footnotes[f[2]].data.encode('koi8-r', 'replace')