#!/usr/bin/env python
# -*- coding: utf-8 -*-

import toolforge
import requests
import re

# Constants
DB_LIST_URL = 'https://noc.wikimedia.org/conf/dblists/all.dblist'
# Wikimedia-operated parent domains considered first-party; any URL found in a
# gadget whose host is NOT under one of these is reported as "third-party".
# (Name suggests these mirror the sites' Content-Security-Policy allowlist —
# NOTE(review): confirm against the production CSP config.)
ALLOWLIST_CSP = [
    'wikimedia.org', 'wikipedia.org', 'wikinews.org', 
    'wiktionary.org', 'wikibooks.org', 'wikiversity.org', 'wikisource.org', 
    'wikiquote.org', 'wikidata.org', 'wikifunctions.org', 'wikivoyage.org', 
    'mediawiki.org',
]

# Fetch the list of wiki database names (one per line).
# A timeout is set so a stalled connection cannot hang the whole job.
# Drop the last entry (empty string left by the trailing newline) and the
# first entry (NOTE(review): presumably a header/aggregate line in the
# dblist — confirm against the all.dblist format).
wikis = requests.get(DB_LIST_URL, timeout=30).text.split('\n')[1:-1]

# SQL query to fetch gadget page titles.
# Namespace 8 is the MediaWiki: namespace; restricting to the 'javascript'
# content model selects gadget/site scripts. Titles are prefixed back with
# 'MediaWiki:' so they can be fetched via the web API. Capped at 500 pages
# per wiki (newest titles first by the DESC ordering).
SQL_QUERY = """
SELECT CONCAT('MediaWiki:', page.page_title) AS gadget
FROM page
WHERE page.page_namespace = 8
  AND page.page_content_model = 'javascript'
GROUP BY page.page_title
ORDER BY page.page_title DESC
LIMIT 500;
"""

# Helper function to fetch page content using MediaWiki API
def fetch_page_content(project, page_title):
    """Return the latest revision wikitext/JS of *page_title* on *project*.

    Parameters:
        project: bare host of the wiki, e.g. 'en.wikipedia.org'.
        page_title: full page title, e.g. 'MediaWiki:Gadget-foo.js'.

    Returns:
        The page content as a string, or '' when the page does not exist or
        has no accessible revision (the original returned None in that case,
        which crashed the regex scan downstream).
    """
    url = f'https://{project}/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': page_title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    # Timeout prevents a single slow wiki from stalling the whole scan.
    response = requests.get(url, params=params, timeout=30)
    pages = response.json().get('query', {}).get('pages', {})
    for page in pages.values():
        # 'revisions' may be absent (missing page) or empty; '*' holds the
        # raw content in the legacy (formatversion=1) response shape.
        revisions = page.get('revisions') or [{}]
        return revisions[0].get('*', '')
    return ''  # no page entry at all in the API response

def extract_domain(url):
    """Return the host part of *url* (scheme optional), or None if absent.

    Everything up to the first '/' after an optional http(s) scheme is
    treated as the domain.
    """
    matched = re.match(r'(?:https?://)?([^/]+)', url)
    return matched.group(1) if matched else None

def scan_for_third_party(content, allowlist=None):
    """Return a comma-separated list of non-allowlisted hosts in *content*.

    Parameters:
        content: text (gadget source) to scan for http(s) URLs.
        allowlist: iterable of parent domains considered first-party;
            defaults to ALLOWLIST_CSP.

    Returns:
        Sorted, ', '-joined third-party hostnames, '' if none were found.

    Bug fixed: the original used a substring test
    (``allowlist_domain in extracted_domain``), so a hostile host such as
    'wikipedia.org.evil.com' slipped through the allowlist. A host is now
    allowed only if it equals an allowlisted domain or is a subdomain of it.
    The result is also sorted so output is deterministic across runs.
    """
    if allowlist is None:
        allowlist = ALLOWLIST_CSP
    # Capture only the host: stop at whitespace, quotes, angle brackets or
    # the first '/' (the original captured host+path and re-split later).
    host_pattern = re.compile(r'https?://([^\s\'"<>/]+)')

    third_party_domains = set()
    for host in host_pattern.findall(content):
        allowed = any(
            host == domain or host.endswith('.' + domain)
            for domain in allowlist
        )
        if not allowed:
            third_party_domains.add(host)

    return ', '.join(sorted(third_party_domains))

# Main script: for every wiki, list its JavaScript gadget pages via the
# replica DB, fetch each page over the API, and record gadgets that
# reference third-party (non-allowlisted) domains.
gadgets = []
meta = toolforge.connect('meta')

for wiki in wikis:
    try:
        conn = toolforge.connect(wiki)
    except Exception as e:
        print(f"Could not connect to {wiki}: {e}")
        continue

    print(f"Scanning gadgets on {wiki}...")

    try:
        with conn.cursor() as cur:
            cur.execute(SQL_QUERY)
            data = cur.fetchall()
    finally:
        # Close the per-wiki connection explicitly; the original leaked one
        # open connection for every wiki scanned.
        conn.close()

    if not data:
        continue

    # Resolve the wiki's public hostname from the meta 'wiki' table.
    with meta.cursor() as cur:
        cur.execute('SELECT url FROM wiki WHERE dbname=%s', (wiki,))
        rows = cur.fetchall()
    if not rows:
        # Original indexed rows[0][0] unconditionally and crashed here.
        print(f"No meta entry found for {wiki}, skipping")
        continue
    project = rows[0][0].replace('https://', '')

    wiki_gadgets = []

    for row in data:
        raw_title = row[0]
        # Replica drivers may return bytes or str depending on configuration.
        gadget = raw_title.decode('utf-8') if isinstance(raw_title, bytes) else raw_title
        # Guard against a missing page / None content from the API helper.
        page_content = fetch_page_content(project, gadget) or ''
        third_party_domains = scan_for_third_party(page_content)
        if third_party_domains:
            wiki_gadgets.append({
                'project': project,
                'gadget': gadget,
                'third_party_domains': third_party_domains
            })

    if wiki_gadgets:
        print(f"Summary for {wiki}:")
        for gadget_info in wiki_gadgets:
            print(f"- Gadget: {gadget_info['gadget']}")
            print(f"  Third-party domains: {gadget_info['third_party_domains']}")
        gadgets.extend(wiki_gadgets)
    else:
        print(f"No gadgets with third-party domains found for {wiki}")

# Build and print the markdown table
output_file = 'gadget-list.md'
table_lines = [
    '| Project | Gadget | Third-Party Domains |',
    '| ------- | ------ | ------------------- |',
]
for entry in gadgets:
    table_lines.append(
        f"| {entry['project']} | {entry['gadget']} | {entry['third_party_domains']} |"
    )
# Join once instead of repeated string concatenation; trailing newline kept.
output = '\n'.join(table_lines) + '\n'

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output)

print(output + '\n')
print(f"Markdown table saved to {output_file}")
