#!/usr/local/anaconda3/bin/python

# After copy/pasting an Ancestry web page with DNA match data on it,
# this script cleans it up and reformats it.  Suppose you paste a DNA match
# page into someName.raw, you run this program by:
#
#   ./clean.py someName.raw
#
# and it creates a file, someName, with clean output like:
#
# name_of_person_with_dna_matches
# name1|cm1
# name2|cm2
# name3|cm3
# ...
#
# A new file made for any person listed in this file should be named exactly
# as that person's name appears here.  I.e., the file named name1 contains
# name1's common matches, the file named name2 contains name2's, and so on.
#
# Mike Markowski, mike.ab3ap@gmail.com
# May 2021

import re
import sys

def clean(dnaFile):
	"""Reformat a pasted Ancestry DNA match page into 'name|cM' lines.

	The output path is dnaFile cut at its last '.raw'.  The first line
	written is the output file's base name, i.e., the person whose matches
	follow.  After that, lines kept by filter() are taken alternately as a
	name and as that name's cM value and written as 'name|cM'.

	dnaFile: path of the raw, pasted text file, e.g., 'someName.raw'.

	Returns True on successful reformatting.  Returns False when dnaFile
	contains no '.raw' (no output name can be derived) or cannot be found.
	"""
	# Cut at the LAST '.raw' so a directory such as 'x.raw/' earlier in the
	# path is left alone.  Without '.raw' there is no sane output name.
	cut = dnaFile.rfind('.raw')
	if cut == -1:
		return False # Fail, not a .raw file name.
	cleanFile = dnaFile[:cut]
	slash = cleanFile.rfind('/')
	dnaName = cleanFile[slash+1:]

	try:
		f = open(dnaFile, 'r')
	except FileNotFoundError:
		return False # Fail, couldn't find match file.

	fPrev = '' # Previous filtered line, for duplicate detection.
	nameField = True # True: next kept line is a name.  False: a cM value.
	viewMatch = 0 # 0: normal, 1: saw 'View match', 2 and 3: skipping.

	# 'with' closes both files even if something below raises.
	with f, open(cleanFile, 'w') as fOut:
		fOut.write('%s\n' % dnaName) # User name whose matches follow.

		for line in f:

			# After a 'View match' line, lines are handled as usual until
			# the first empty line.  That empty line and the line after
			# it are dropped, then normal processing resumes.
			if line == '\n' and viewMatch == 1:
				viewMatch = 2
				continue
			elif viewMatch == 2:
				viewMatch = 3
				continue
			elif viewMatch == 3:
				viewMatch = 0

			filtered = filter(line)
			if fPrev == filtered: # Don't print duplicate lines.
				continue

			if filtered != '':
				if filtered.find('View match') > -1:
					viewMatch = 1
				elif nameField: # Ancestry name.
					fOut.write(filtered)
					nameField = False
				else: # Ancestry cM.
					m = re.search('[0-9]', filtered)
					if m is not None:
						fOut.write('|%s\n' % filtered[m.start():])
						nameField = True
					# A line with no digit can't be the cM value.  Skip
					# it and keep waiting instead of failing on m.start().

			fPrev = filtered
	return True # Successful reformatting.

def filter(line):
	"""Return the useful text of one pasted line, or '' if it is clutter.

	line: one line of the pasted web page.

	Surrounding whitespace is stripped first.  The result is '' when the
	stripped line has two or fewer characters or contains any substring
	listed below.  A line containing ' cM' is cut just before the first
	' cM' and has its commas removed.  Any other line comes back stripped
	but otherwise unchanged.

	Note that this module-level name shadows Python's built-in filter().
	"""
	text = line.strip()

	# Too short to be a name or a cM value.
	if len(text) <= 2:
		return ''

	# Substrings marking grunge from the pasted web page.
	grunge = (
		'1 person',
		'Add note',
		'Add to group',
		'Another Test',
		'anaged by',
		' ancestor',
		'Ancestry Blog',
		'AncestryCorporate',
		'backend services',
		'Careers',
		'Common ancestors',
		' Cousin',
		' DNA Matches',
		' Family',
		'Filter by',
		'Gift Memberships',
		'Groups',
		' linked ',
		'List Map',
		'Message',
		'No Trees',
		'Notes',
		'Parent',
		' People',
		'Private',
		'Search',
		'Shared',
		'Site Map',
		'Support Center',
		' unavailable',
		'Tools',
		' Trees',
		'Unlinked',
		'United States',
		'Unviewed',
		'Visit our other',
		'You and ',
	)
	if any(marker in text for marker in grunge):
		return ''

	# On a cM line keep only what precedes ' cM', minus commas in numbers.
	amount, found, _ = text.partition(' cM')
	if found:
		return amount.replace(',', '')

	return text # Good stuff!  A name.

if __name__ == '__main__':
	# Exactly one thing is needed: the pasted match page, someName.raw.
	if len(sys.argv) < 2:
		# Without this check sys.argv[1] raises IndexError.
		print('Usage: %s someName.raw' % sys.argv[0])
		sys.exit(1)

	filename = sys.argv[1]
	if not filename.endswith('.raw'): # Test what the message below says.
		print('File name %s does not end in .raw.' % filename)
	elif not clean(filename):
		# clean() reports failure by returning False; don't hide it.
		print('File %s not found.' % filename)
