#! /usr/bin/python3
#
# Copyright (C) 1998-2018 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Poll the NNTP servers for messages to be gatewayed to mailing lists.

Usage: gate_news [options]

Where options are

    --help
    -h
        Print this text and exit.

"""

import sys
import os
import time
import getopt
import socket
import nntplib

import paths
# Import this /after/ paths so that the sys.path is properly hacked
import email.errors
from email.parser import Parser

from Mailman import mm_cfg
from Mailman import MailList
from Mailman import Utils
from Mailman import Message
from Mailman import LockFile
from Mailman.i18n import _
from Mailman.Queue.sbcache import get_switchboard
from Mailman.Logging.Utils import LogStdErr
from Mailman.Logging.Syslog import syslog

# Work around known problems with some RedHat cron daemons by restoring the
# default SIGCHLD disposition before anything else runs.
import signal
signal.signal(signal.SIGCHLD, signal.SIG_DFL)

# Path of the lock file that main() acquires for the duration of a run.
GATENEWS_LOCK_FILE = os.path.join(mm_cfg.LOCK_DIR, 'gate_news.lock')

# NOTE(review): presumably sends this script's stderr output to Mailman's
# 'error' log tagged with 'gate_news' -- confirm against
# Mailman.Logging.Utils.LogStdErr.
LogStdErr('error', 'gate_news', manual_reprime=0)

# Lifetime given to the gate_news lock when main() creates it.
LOCK_LIFETIME = mm_cfg.hours(2)
# Separator used to join the lines of an article before it is parsed.
NL = '\n'

# Raised inside the per-article try: block of poll_newsgroup() to abandon the
# current article and carry on with the next one.  It dates back to Python
# versions before 2.1, where a `continue' statement was not allowed inside a
# try: block.
class _ContinueLoop(Exception):
    """Control-flow signal: skip the article currently being processed."""


def usage(code, msg=''):
    """Print the (translated) module help text and exit with status `code'.

    The text goes to stderr when `code' is non-zero and to stdout otherwise.
    A non-empty `msg' is printed to the same stream after the help text.
    """
    stream = sys.stderr if code else sys.stdout
    print(_(__doc__), file=stream)
    if msg:
        print(msg, file=stream)
    sys.exit(code)


# Open NNTP connections, keyed by the nntp_host setting of the lists using
# them.
_hostcache = {}

def open_newsgroup(mlist):
    """Select mlist's linked newsgroup on a (possibly shared) NNTP connection.

    Returns a (conn, first, last) tuple where first and last are the lowest
    and highest article numbers reported by the server, as ints.

    A failure to open a new connection is logged and re-raised.  Errors from
    the GROUP command itself propagate to the caller unlogged.
    """
    # Split host:port if given
    host, port = Utils.nntpsplit(mlist.nntp_host)
    # A "mode reader" connection is shared by all the gated lists having the
    # same nntp_host, so only open one when the cache has nothing for us.
    cached = _hostcache.get(mlist.nntp_host)
    if cached is not None:
        conn = cached
    else:
        try:
            conn = nntplib.NNTP(host, port,
                                readermode=True,
                                user=mm_cfg.NNTP_USERNAME,
                                password=mm_cfg.NNTP_PASSWORD)
        except (socket.error, nntplib.NNTPError, IOError, EOFError) as e:
            syslog('fromusenet',
                   'error opening connection to nntp_host: %s\n%s',
                   mlist.nntp_host, e)
            raise
        _hostcache[mlist.nntp_host] = conn
    # Of the GROUP reply we are only really interested in the first and the
    # last article number.
    _response, _count, lowest, highest, _name = \
        conn.group(mlist.linked_newsgroup)
    return conn, int(lowest), int(highest)


def clearcache():
    """Close every cached NNTP connection and empty the connection cache.

    Each distinct connection object found in _hostcache is sent quit()
    exactly once.  Closing is best effort: a connection that cannot be closed
    cleanly (for instance because the server already dropped it) is logged and
    skipped, so the remaining connections are still closed, the cache is
    always emptied, and the caller's own cleanup is not cut short by an
    exception raised from here.
    """
    # More than one cache key may refer to the same connection object; track
    # identities so that no connection is closed twice.
    closed = set()
    try:
        for host, conn in list(_hostcache.items()):
            if id(conn) in closed:
                continue
            closed.add(id(conn))
            try:
                conn.quit()
            except Exception as e:
                syslog('fromusenet',
                       'error closing connection to nntp_host: %s\n%s',
                       host, e)
    finally:
        _hostcache.clear()




def _article_lines(result):
    """Return the sequence of lines held by a conn.head()/conn.body() result.

    Several result shapes are accepted: a tuple with more than three items
    (the lines are the fourth item), a two-item tuple whose second item has a
    `lines' attribute, and a two-item tuple whose second item is a list.  As a
    last resort the first non-empty list or tuple found in the result is used.
    Returns None when nothing suitable is found.
    """
    lines = None
    if isinstance(result, tuple):
        if len(result) > 3:
            # Standard format: (response, number, id, lines)
            lines = result[3]
        elif len(result) == 2 and hasattr(result[1], 'lines'):
            # Format with ArticleInfo object
            lines = result[1].lines
        elif len(result) == 2 and isinstance(result[1], list):
            # Another possible format
            lines = result[1]
    if lines is None:
        for item in result:
            if isinstance(item, (list, tuple)) and item:
                lines = item
                break
    return lines


def _as_text(lines):
    """Return `lines' as a list of str, decoding bytes-like items as UTF-8."""
    text = []
    for line in lines:
        if not isinstance(line, str) and hasattr(line, 'decode'):
            text.append(line.decode('utf-8', errors='replace'))
        else:
            text.append(str(line))
    return text


def _fetch_part(fetch, num, what):
    """Fetch one part of article `num' and return its raw lines.

    `fetch' is the bound conn.head or conn.body method and `what' is the
    matching word ('headers' or 'body') used in the log messages.  Any
    failure, NNTP errors included, is logged and turned into _ContinueLoop.
    """
    try:
        result = fetch(str(num))
        lines = _article_lines(result)
    except Exception as e:
        syslog('fromusenet', 'Error getting %s for %d: %s', what, num, str(e))
        raise _ContinueLoop
    # Checked outside the try: so that the _ContinueLoop raised here is not
    # itself caught and logged a second time as a fetch error.
    if lines is None:
        syslog('fromusenet', 'Could not extract %s from: %s',
               what, repr(result))
        raise _ContinueLoop
    return lines


def _gate_article(mlist, conn, num, listname):
    """Enqueue article `num' for the list unless the list has seen it.

    Raises _ContinueLoop, after logging the reason, when the article has to
    be skipped.  Returns without doing anything when the article carries this
    list's own X-BeenThere header.
    """
    headers = _fetch_part(conn.head, num, 'headers')
    # I don't know how this happens, but skip an empty message.
    if not headers:
        syslog('fromusenet', 'Empty headers for message %d', num)
        raise _ContinueLoop
    headers = _as_text(headers)

    found_to = False
    own_beenthere = ': %s' % mlist.GetListEmail()
    for header in headers:
        i = header.find(':')
        if i <= 0:
            continue
        name = header[:i].lower()
        if name == 'to':
            found_to = True
        elif name == 'x-beenthere' and header[i:] == own_beenthere:
            # The article came from this list in the first place.
            return

    body = _as_text(_fetch_part(conn.body, num, 'body'))

    # Usenet originated messages will not have a Unix envelope
    # (i.e. "From " header).  This breaks Pipermail archiving, so
    # we will synthesize one.  Be sure to use the format searched
    # for by mailbox.UnixMailbox._isrealfromline().  BAW: We use
    # the -bounces address here in case any downstream clients use
    # the envelope sender for bounces; I'm not sure about this,
    # but it's the closest to the old semantics.
    lines = ['From %s  %s' % (mlist.GetBouncesEmail(),
                              time.ctime(time.time()))]
    lines.extend(headers)
    lines.append('')
    lines.extend(body)
    lines.extend(['', ''])

    p = Parser(Message.Message)
    try:
        msg = p.parsestr(NL.join(lines))
    except email.errors.MessageError as e:
        syslog('fromusenet',
               'email package exception for %s:%d\n%s',
               mlist.linked_newsgroup, num, e)
        raise _ContinueLoop

    # Keep the original recipient around, then address the message to the
    # list.
    if found_to:
        del msg['X-Originally-To']
        msg['X-Originally-To'] = msg['To']
        del msg['To']
    msg['To'] = mlist.GetListEmail()

    # Post the message to the locked list
    inq = get_switchboard(mm_cfg.INQUEUE_DIR)
    inq.enqueue(msg,
                listname = listname,
                fromusenet = 1)
    syslog('fromusenet',
           'posted to list %s: %7d' % (listname, num))


# This function requires the list to be locked.
def poll_newsgroup(mlist, conn, first, last, glock):
    """Gate the articles numbered first .. last-1 to the mailing list.

    mlist -- the (locked) list to post to
    conn  -- NNTP connection with the list's newsgroup already selected
    first -- number of the first article to fetch
    last  -- one past the number of the last article to fetch
    glock -- the gate_news lock; refreshed before every article

    Problems with a single article are logged and the article is skipped.
    The list's usenet_watermark is set to the last article that was dealt
    with, whether it was posted or skipped.  It is left untouched when no
    article was dealt with at all, and it is still recorded when an exception
    (e.g. from glock.refresh()) cuts the run short, so that articles already
    enqueued are not gated a second time on the next run.
    """
    listname = mlist.internal_name()
    syslog('fromusenet', 'poll_newsgroup for %d -> %d', first, last)

    handled = None
    try:
        for num in range(first, last):
            glock.refresh()
            try:
                _gate_article(mlist, conn, num, listname)
            except _ContinueLoop:
                # Already logged where it was raised.
                pass
            except Exception as e:
                syslog('fromusenet',
                       'Unexpected error processing article %7d: %s',
                       num, str(e))
            handled = num
    finally:
        # Even if we don't post the message because it was seen on the
        # list already, or if we skipped it as unparseable or empty,
        # update the watermark.
        if handled is not None:
            mlist.usenet_watermark = handled




def process_lists(glock):
    """Poll the linked newsgroup of every list that gates news to mail.

    glock is the gate_news lock; it is refreshed before each list and handed
    on to poll_newsgroup().

    A list whose newsgroup cannot be opened, or whose list lock cannot be
    acquired in time, is logged and skipped.  Other exceptions percolate up
    to the caller, after the list has been saved and unlocked if it was
    locked.
    """
    for listname in Utils.list_names():
        glock.refresh()
        # Open the list unlocked just to check to see if it is gating news to
        # mail.  If not, we're done with the list.  Otherwise, lock the list
        # and gate the group.
        mlist = MailList.MailList(listname, lock=0)
        if not mlist.gateway_to_mail:
            continue
        # Get the list's watermark, i.e. the last article number that we gated
        # from news to mail.  `None' means that this list has never polled its
        # newsgroup and that we should do a catch up.
        watermark = getattr(mlist, 'usenet_watermark', None)
        # Open the newsgroup, but let most exceptions percolate up.
        try:
            conn, first, last = open_newsgroup(mlist)
        except (socket.error, nntplib.NNTPError, IOError, EOFError) as e:
            syslog('fromusenet',
                   "%s: couldn't open newsgroup %s: skipping\n%s",
                   listname, mlist.linked_newsgroup, e)
            continue
        syslog('fromusenet', '%s: [%d..%d]' % (listname, first, last))
        try:
            if watermark is None:
                mlist.Lock(timeout=mm_cfg.LIST_LOCK_TIMEOUT)
                # This is the first time we've tried to gate this
                # newsgroup.  We essentially do a mass catch-up, otherwise
                # we'd flood the mailing list.
                mlist.usenet_watermark = last
                syslog('fromusenet', '%s caught up to article %d' %
                       (listname, last))
            else:
                # The list has been polled previously, so now we simply
                # grab all the messages on the newsgroup that have not
                # been seen by the mailing list.  The first such article
                # is the maximum of the lowest article available in the
                # newsgroup and the watermark.  It's possible that some
                # articles have been expired since the last time gate_news
                # has run.  Not much we can do about that.
                start = max(watermark+1, first)
                if start > last:
                    syslog('fromusenet', 'nothing new for list %s' %
                           listname)
                else:
                    mlist.Lock(timeout=mm_cfg.LIST_LOCK_TIMEOUT)
                    syslog('fromusenet', 'gating %s articles [%d..%d]' %
                           (listname, start, last))
                    # Use last+1 because poll_newsgroup() employs a for
                    # loop over range, and this will not include the last
                    # element in the list.
                    poll_newsgroup(mlist, conn, start, last+1, glock)
        except LockFile.TimeOutError:
            syslog('fromusenet', 'Could not acquire list lock: %s' %
                   listname)
        finally:
            if mlist.Locked():
                mlist.Save()
                mlist.Unlock()
        # The watermark is still unset when a never-polled list could not be
        # locked above, so it is formatted with %s (which prints an int just
        # like %d does) instead of %d, which would raise on None and abort
        # the run for all the remaining lists.
        syslog('fromusenet', '%s watermark: %s' %
               (listname, getattr(mlist, 'usenet_watermark', None)))




def main():
    """Run one gate_news pass under the gate_news lock.

    Returns right away, after logging, when the lock cannot be acquired
    within half a second.  Otherwise every list is processed, and afterwards
    the cached NNTP connections are closed and the lock is released, whether
    or not processing raised.
    """
    lock = LockFile.LockFile(GATENEWS_LOCK_FILE,
                             # it's okay to hijack this
                             lifetime=LOCK_LIFETIME)
    try:
        lock.lock(timeout=0.5)
    except LockFile.TimeOutError:
        syslog('fromusenet', 'Could not acquire gate_news lock')
        return
    try:
        process_lists(lock)
    finally:
        # Release the lock even if closing the connections fails; otherwise
        # the lock would linger until its lifetime runs out.
        try:
            clearcache()
        finally:
            lock.unlock(unconditionally=1)




if __name__ == '__main__':
    # The only recognized options are -h/--help; positional arguments are
    # rejected.
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h', ['help'])
    except getopt.error as err:
        usage(1, err)

    if args:
        usage(1, 'No args are expected')

    if any(opt in ('-h', '--help') for opt, arg in opts):
        usage(0)

    main()
