| Douglas Bagnall on Wed, 4 Sep 2019 11:49:17 +0200 (CEST) |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
| Re: <nettime> The effect of "Nettime is in bad shape" on user agent ratios |
On 4/09/19 4:41 am, John Preston wrote:
> Thanks Douglas. I like this. I would like to play with this on a
> wider scale (listiverse). Do you have a script to scrape out the
> headers from the archived messages or something?
Not if you mean web archives. If you mean mbox files, then yes -- last
night's script is below. If you wish to compare time periods you will
need to cut up the mbox file yourself.
Douglas
----------------8<------countmuas.py-----------------------------------
#!/usr/bin/python3
"""Count user agent headers in mbox files
USAGE: python3 countmuas.py MBOX [MBOX [...]]
The order in which the results are presented depends on the overall
counts, thus the output of
countmuas.py A B C
is likely to look different from
countmuas.py A; countmuas.py B; countmuas.py C
User-agent and X-Mailer headers are used where available; further
heuristics attempt to distinguish webmail providers.
"""
import mailbox
import sys
from collections import Counter
import re
# Matchers for header *names* that betray the sending provider even when no
# User-Agent / X-Mailer header is present.
is_google = re.compile('^(x-gm-|x-google)', re.I).match
is_microsoft = re.compile('^(x-ms-|x-microsoft)', re.I).match

# (substring, label) pairs tested, in this order, against the lower-cased,
# punctuation-free agent string once the Yahoo/AOL and Thunderbird checks
# have failed.
_KNOWN_CLIENTS = (
    ('mew version on emacs', 'Mew (Emacs)'),
    ('cyrusjmap', 'Cyrus webmail'),
    ('jaro mail', 'Jaro Mail'),
    ('trojita', 'Trojita'),
    ('xsll', 'XS4all Webmail'),
    ('claws mail', 'Claws Mail'),
)


def _raw_agent_label(msg):
    """Build the un-normalised agent string for one message.

    The string is the '|'-joined list of: 'microsoft' and/or 'gmail' when
    provider-specific header names are present, the User-Agent value and the
    X-Mailer value (both with version numbers stripped). It is "unknown" when
    none of these produce any text.
    """
    header_names = msg.keys()
    parts = []
    if any(is_microsoft(h) for h in header_names):
        parts.append('microsoft')
    if any(is_google(h) for h in header_names):
        parts.append('gmail')

    ua = msg.get('User-Agent')
    if ua:
        # Headers containing raw 8-bit bytes come back as
        # email.header.Header objects, which re.sub() rejects; str() is a
        # no-op for ordinary string values.
        ua = str(ua)
        ua = re.sub(r'[\d.]\w$', '', ua)
        ua = re.sub(r'\d\w?[\d.]*', '', ua)
        parts.append(ua)

    xm = msg.get('X-Mailer')
    if xm:
        xm = re.sub(r'\d+[\d.]*\w?[\d.]*', '', str(xm))
        parts.append(xm)

    label = '|'.join(parts) or "unknown"
    return re.sub(r'\s+', ' ', label).strip()


def _normalise_label(raw):
    """Map a raw agent string onto the display name used in the report."""
    ua = re.sub(r'[^\w ]+', '', raw).lower().strip()

    if any(x in ua for x in ('ymailnorrin', 'aolwebmail', 'yahoomail')):
        return 'yahoo/aol'

    if 'thunderbird' in ua:
        for o in ('linux', 'macintosh', 'windows'):
            if o in ua:
                return 'Thunderbird (%s)' % o.title()
        # No recognised OS: the cleaned string is reported unchanged.
        return ua

    for needle, label in _KNOWN_CLIENTS:
        if needle in ua:
            return label

    if ua in ('microsoft', 'microsoftgmail'):
        return 'MS/Outlook.com/Hotmail'
    if ua == 'gmail':
        return 'Gmail'

    # Anything else: drop the provider tag and packaging noise, then
    # title-case whatever is left.
    ua = ua.replace('gmail', '')
    ua = re.sub(' ?deb$', '', ua)
    ua = re.sub(r' version\s*$', '', ua)
    return ua.title()


def count_user_agents(mbox):
    """Count the mail user agents seen in one mbox file.

    Args:
        mbox: path of the mbox file to read.

    Returns:
        A Counter mapping a normalised user-agent name to the number of
        messages attributed to it.

    Raises:
        mailbox.NoSuchMailboxError: if the file does not exist. The mailbox
            is opened with create=False so that a mistyped path is reported
            instead of silently creating an empty file.
    """
    m = mailbox.mbox(mbox, create=False)
    try:
        # Iterating the mailbox yields messages one at a time, so the whole
        # archive is never held in memory and is only parsed once.
        ua_counts = Counter(_raw_agent_label(msg) for msg in m)
    finally:
        m.close()

    clean = Counter()
    for ua, count in ua_counts.most_common():
        clean[_normalise_label(ua)] += count
    return clean
def print_user_agents(counts, names=None):
    """Print a one-line-per-agent text bar chart of *counts*.

    Each line has the form ``|<bar> <percent>% <name>``, where the bar holds
    one '#' per percentage point (rounded half up) and is left-justified in a
    30-character column.

    Args:
        counts: mapping of user-agent name to message count.
        names: the names to print, in order. Names absent from *counts* are
            printed with a count of zero. Defaults to the keys of *counts*
            in sorted order.
    """
    if names is None:
        names = sorted(counts)
    total = sum(counts.values())
    for ua in names:
        count = counts.get(ua, 0)
        # An empty mailbox has total == 0; report 0% rather than dividing
        # by zero.
        percent = (count * 100.0 / total) if total else 0.0
        bar = '#' * int(percent + 0.5)
        print("|%-30s %4.1f%% %s" % (bar, percent, ua))
def main():
    """Command-line entry point: report user-agent ratios per mbox file.

    Every argument in sys.argv[1:] is treated as an mbox path. If '-h' or
    '--help' appears among them, the module docstring is printed and the
    process exits. Otherwise each file is counted, and the files are then
    reported one after another using a single shared row order (most common
    agent over all files first), so the charts line up between files.
    """
    files = sys.argv[1:]
    if {'-h', '--help'}.intersection(files):
        print(__doc__)
        sys.exit()

    overall = Counter()
    mbox_counts = []
    for mbox in files:
        counts = count_user_agents(mbox)
        mbox_counts.append(counts)
        overall.update(counts)

    # Row order is decided by the totals over all files.
    names = [name for name, _ in overall.most_common()]

    for filename, counts in zip(files, mbox_counts):
        print('----- %s -----' % filename)
        print_user_agents(counts, names)


# Only run when executed as a script, so that importing this module (for
# reuse or testing) does not process sys.argv.
if __name__ == '__main__':
    main()
# distributed via <nettime>: no commercial use without permission
# <nettime> is a moderated mailing list for net criticism,
# collaborative text filtering and cultural politics of the nets
# more info: http://mx.kein.org/mailman/listinfo/nettime-l
# archive: http://www.nettime.org contact: nettime@kein.org
# @nettime_bot tweets mail w/ sender unless #ANON is in Subject: