Python
Børre Stenseth

XML

Hva

Python har en rimelig grei verktøykasse for å parse og inspisere XML-filer. Vi skal se på noen muligheter.

Datagrunnlaget er fila all_results.xml med resultater fra sprintøvelsene i olympiske leker de siste årene.

Du vil se av koden nedenfor at metodene find() og findall() er sentrale for å lokalisere noder i treet. En parameter er et forenklet XPath-uttrykk [1]. Mer om XPath i modulen lxml, og enda mer i materialet om XSLT.

Vi tar utgangspunkt i følgende Python-modul.

"""
Reading xml, and report 
"""
import sys
import xml.etree.ElementTree as TRE
 
"""
Load data file and establish tree
return the rootnode
"""
def loadIt(filename):
    """Parse an XML file and return the root element of its tree.

    Args:
        filename: Path of the XML file to parse, or an open file object.

    Returns:
        The root element of the parsed tree, or None when the file cannot
        be read or is not well-formed XML. In the failure case the error
        is printed to standard output.
    """
    try:
        tree = TRE.parse(filename)
    except (OSError, TRE.ParseError) as err:
        # Only I/O and XML syntax failures are expected here. A bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        print(err)
        return None
    return tree.getroot()
 
"""
All athlet names, unsorted as found
"""
def allAthletNames(root):
    """Print the name of every athlete, one per line, in the order found.

    Args:
        root: Element whose OlympicGame/event/athlet/name descendants
            are listed. No sorting is applied.
    """
    for name_node in root.iterfind('OlympicGame/event/athlet/name'):
        print(name_node.text)
 
"""
All olympics, place and year
"""
def allOlympics(root):
    """Print place and year of every OlympicGame directly below root.

    Args:
        root: Element whose OlympicGame children are listed. Each child
            must carry the attributes 'place' and 'year'; a missing
            attribute raises KeyError.
    """
    for game in root.iterfind('OlympicGame'):
        attributes = game.attrib
        print(attributes['place'], attributes['year'])
 
"""
Athlets running a certain dist in a certain year
Using reduced XPath
"""
def athletsInDistYear(root, dist, year):
    """Print the athletes who ran a given distance in a given year.

    A heading line with the distance and year is printed first, then one
    athlete name per line in the order found.

    Args:
        root: Element below which the OlympicGame elements are searched.
        dist: Value to match against the event's 'dist' attribute.
        year: Value to match against the game's 'year' attribute.
    """
    print(dist, year)
    name_path = ".//OlympicGame[@year='%s']/event[@dist='%s']/athlet/name" % (year, dist)
    for name_node in root.iterfind(name_path):
        print(name_node.text)
 
"""
Athlets running a certain dist in a certain year, ordered by result
Using reduced XPath
"""
def timeused(t):
    """Sort key: return the result part of a (name, result) pair as a float.

    Args:
        t: Sequence whose item at index 1 is the result text.
    """
    result_text = t[1]
    return float(result_text)


def athletsOrderedInDistYear(root, dist, year):
    """Print athletes of one distance in one year, ordered by result.

    A heading line with the distance and year is printed first, then one
    "name result" line per athlete, sorted ascending by the numeric value
    of the result.

    Args:
        root: Element below which the OlympicGame elements are searched.
        dist: Value to match against the event's 'dist' attribute.
        year: Value to match against the game's 'year' attribute.
    """
    print(dist, year)
    athlet_path = ".//OlympicGame[@year='%s']/event[@dist='%s']/athlet" % (year, dist)
    result_by_name = {}
    for athlet in root.iterfind(athlet_path):
        result = athlet.find("./result").text
        name = athlet.find("./name").text
        result_by_name[name] = result
    ranking = list(result_by_name.items())
    ranking.sort(key=timeused)
    for name, result in ranking:
        print(name, result)
 
"""
Best result of athlet in a distance, regardless of year
Using reduced XPath
"""
def athletsOrderedInDist(root, dist):
    """Print each athlete's best result on a distance, across all years.

    A heading line is printed first, then one "name result" line per
    athlete, sorted ascending by the numeric value of the result. For an
    athlete who appears several times, only the lowest result is kept.

    Args:
        root: Element below which the event elements are searched.
        dist: Value to match against the event's 'dist' attribute.
    """
    print(dist, "all olympics")
    best_by_name = {}
    for athlet in root.findall(".//event[@dist='%s']/athlet" % (dist)):
        name = athlet.find("./name").text
        result = athlet.find("./result").text
        # Keep the stored result unless this one is strictly lower.
        if name in best_by_name and not (float(best_by_name[name]) > float(result)):
            continue
        best_by_name[name] = result
    ranking = list(best_by_name.items())
    ranking.sort(key=timeused)
    for name, result in ranking:
        print(name, result)
"""
Best result of athlets in a distance (where and when) , regardless of year
Using reduced XPath
"""
def timeused2(t):
    """Sort key: return the result of a (name, [result, place, year]) pair.

    Args:
        t: Sequence whose item at index 1 is itself a sequence holding the
            result text at index 0.
    """
    best_entry = t[1]
    return float(best_entry[0])


def athletsOrderedInDistWhere(root, dist):
    """Print each athlete's best result on a distance, with place and year.

    A heading line is printed first, then one line per athlete holding the
    result, the name and the place and year of the game where that result
    was set. Lines are sorted ascending by the numeric value of the result.

    Args:
        root: Element below which the OlympicGame elements are searched.
        dist: Value to match against the event's 'dist' attribute.
    """
    print(dist, "all olympics")
    athlet_path = ".//event[@dist='%s']/athlet" % dist
    best_by_name = {}
    for game in root.findall(".//OlympicGame"):
        for athlet in game.findall(athlet_path):
            name = athlet.find("./name").text
            result = athlet.find("./result").text
            # Keep the stored entry unless this result is strictly lower.
            if name in best_by_name and not (float(best_by_name[name][0]) > float(result)):
                continue
            best_by_name[name] = [result, game.attrib['place'], game.attrib['year']]
    ranking = list(best_by_name.items())
    ranking.sort(key=timeused2)
    for name, (result, place, year) in ranking:
        print(result, '\t', name, ',', place, year)
  
# Load the data file; loadIt() prints the error and returns None on failure.
root = loadIt('all_results.xml')
if root is not None:
    # Uncomment any of the calls below to try the other reports.
    #allAthletNames(root)
    #allOlympics(root)
    #athletsInDistYear(root, '400m', '2004')
    #athletsOrderedInDistYear(root, '100m', '1992')
    #athletsOrderedInDist(root, '100m')
    athletsOrderedInDistWhere(root, '100m')

Resultatet av metoden athletsOrderedInDistWhere() blir:

100m all olympics
09.63 	 Usain Bolt , London 2012
09.75 	 Yohan Blake , London 2012
09.79 	 Justin Gatlin , London 2012
09.80 	 Tyson Gay , London 2012
09.84 	 Donovan Bailey , Atlanta 1996
09.86 	 Francis Obikwelu , Athens 2004
09.87 	 Maurice Greene , Sidney 2000
09.88 	 Ryan Bailey , London 2012
09.89 	 Shawn Crawford , Athens 2004
09.89 	 Frank Fredericks , Atlanta 1996
09.89 	 Richard Thompson , Beijing 2008
09.90 	 Ato Boldon , Atlanta 1996
09.91 	 Walter Dix , Beijing 2008
09.93 	 Churandy Martina , Beijing 2008
09.94 	 Asafa Powell , Athens 2004
09.96 	 Linford Christie , Barcelona 1992
09.97 	 Michael Frater , Beijing 2008
09.99 	 Dennis Mitchell , Atlanta 1996
10.00 	 Michael Marsh , Atlanta 1996
10.00 	 Kim Collins , Athens 2004
10.01 	 Marc Burns , Beijing 2008
10.03 	 Darvis Patton , Beijing 2008
10.04 	 Obadele Thompson , Sidney 2000
10.08 	 Dwain Chambers , Sidney 2000
10.09 	 Jonathan Drummond , Sidney 2000
10.09 	 Bruny Surin , Barcelona 1992
10.10 	 Leroy Burrell , Barcelona 1992
10.12 	 Olapade Adeniken , Barcelona 1992
10.13 	 Darren Campbell , Sidney 2000
10.14 	 Davidson Ezima , Atlanta 1996
10.16 	 Michael Green , Atlanta 1996
10.22 	 Raymond Stewart , Barcelona 1992

Kopier Python-koden og datafila som nevnt over, kjør de forskjellige metodene og gjør dine egne eksperimenter.

Referanser
  1. XPath 1.0 W3C www.w3.org/TR/xpath 14-03-2014