diff --git a/data/update-data.py b/data/update-data.py
index 5cfac8d7..defa3c50 100755
--- a/data/update-data.py
+++ b/data/update-data.py
@@ -9,17 +9,21 @@
#
import os
+import io
import json
import requests
import subprocess
import yaml
from datetime import date
+from lxml import etree
from tempfile import TemporaryDirectory
IANA_TLD_LIST_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
SPDX_REPO_URL = 'https://github.com/spdx/license-list-data.git'
MENU_SPEC_URL = 'https://gitlab.freedesktop.org/xdg/xdg-specs/raw/master/menu/menu-spec.xml'
+XPATH_MAIN_CATEGORIES = '/article/appendix[@id="category-registry"]/sect1[@id="main-category-registry"]//tbody/row/*[1]'  # first child element of every table row in the "main-category-registry" section of the menu spec
+XPATH_ADDITIONAL_CATEGORIES = '/article/appendix[@id="category-registry"]/sect1[@id="additional-category-registry"]//tbody/row/*[1]'  # same selection, but for the "additional-category-registry" section
def update_tld_list(url, fname):
@@ -163,95 +167,18 @@ def write_platform_data(fname, values):
def update_categories_list(spec_url, cat_fname):
- ''' The worst parser ever, extracting category information directoly from the spec Docbook file '''
- from enum import Enum, auto
+ print('Updating XDG categories data...')
req = requests.get(spec_url)
-
- class SpecSection(Enum):
- NONE = auto()
- MAIN_CATS = auto()
- MAIN_CATS_BODY = auto()
- EXTRA_CATS = auto()
- EXTRA_CATS_BODY = auto()
-
- def get_entry(line):
- start = line.index('') + 7
- end = line.index('')
- return line[start:end].strip()
-
- main_cats = []
- extra_cats = []
-
- current_cat = {}
- spec_sect = SpecSection.NONE
- for line in str(req.content, 'utf-8').splitlines():
- if 'Main Category' in line:
- spec_sect = SpecSection.MAIN_CATS
- continue
-
- if 'Additional Category' in line:
- spec_sect = SpecSection.EXTRA_CATS
- continue
-
- if '
' in line:
- current_cat = {}
- if spec_sect == SpecSection.MAIN_CATS:
- spec_sect = SpecSection.MAIN_CATS_BODY
- else:
- spec_sect = SpecSection.EXTRA_CATS_BODY
- continue
-
- if spec_sect == SpecSection.MAIN_CATS_BODY:
- if '' in line:
- if current_cat:
- main_cats.append(current_cat)
- current_cat = {}
- continue
- if '
' in line:
- if current_cat:
- main_cats.append(current_cat)
- current_cat = {}
- spec_sect = SpecSection.NONE
- continue
-
- if '' in line:
- if current_cat.get('desc'):
- continue
- if current_cat:
- current_cat['desc'] = get_entry(line)
- else:
- current_cat['name'] = get_entry(line)
- continue
-
- if spec_sect == SpecSection.EXTRA_CATS_BODY:
- if '' in line:
- if current_cat:
- extra_cats.append(current_cat)
- current_cat = {}
- continue
- if '' in line:
- if current_cat:
- main_cats.append(current_cat)
- current_cat = {}
- spec_sect = SpecSection.NONE
- # nothing interesting follows for us after the additional categories are done
- break
-
- if '' in line:
- if current_cat.get('rel'):
- continue
- if current_cat:
- if not current_cat.get('desc'):
- current_cat['desc'] = get_entry(line)
- if not current_cat.get('rel'):
- current_cat['rel'] = get_entry(line)
- else:
- current_cat['name'] = get_entry(line)
- continue
-
- all_cat_names = [cat['name'] for cat in main_cats]
- all_cat_names.extend([cat['name'] for cat in extra_cats])
+ tree = etree.parse(io.BytesIO(req.content))
+
+ all_cat_names = []
+ entries = tree.xpath(XPATH_MAIN_CATEGORIES)
+ assert len(entries) > 0
+ all_cat_names.extend([e.text for e in entries])
+ entries = tree.xpath(XPATH_ADDITIONAL_CATEGORIES)
+ assert len(entries) > 0
+ all_cat_names.extend([e.text for e in entries])
all_cat_names.sort()
with open(cat_fname, 'w') as f: