# HG changeset patch # User Gustavo Picon # Date 1270792269 18000 # Node ID e7594eb1b0aafa3f9e52b3325c13f00b21888c6e # Parent b03f4b528ea19c5b2a174d45368e61f46eb14591 improved behavior of str2tags, added normalization diff -r b03f4b528ea19c5b2a174d45368e61f46eb14591 -r e7594eb1b0aafa3f9e52b3325c13f00b21888c6e tagtools.py --- a/tagtools.py Thu Apr 08 23:39:36 2010 -0500 +++ b/tagtools.py Fri Apr 09 00:51:09 2010 -0500 @@ -1,20 +1,35 @@ -import unicodedata +""" tagtools +""" + class Serializer(object): + """ TODO: docstring + """ SEPARATOR = JOINER = TAGS_WITH_SPACES = None @classmethod def str2tags(cls, tagstr): + """ TODO: docstring + """ if not tagstr: return [] - return [ - tag.strip() - for tag in tagstr.split(cls.SEPARATOR) - if tag.strip() - ] + tags, keys = [], set() + for tag in tagstr.split(cls.SEPARATOR): + tag = tag.strip() + cleantag = cls.normalize(tag) + if not cleantag or cleantag in keys: + # Ignore if the normalized tag is empty or if there is + # already tag with the same normalized value. + # TaG, TAG, tag, taG ==> TaG + continue + tags.append((cleantag, tag)) + keys.add(cleantag) + return tags @classmethod def tags2str(cls, tags): + """ TODO: docstring + """ if cls.TAGS_WITH_SPACES: return cls.JOINER.join(tags) results = [] @@ -25,28 +40,53 @@ results.append(tag) return cls.JOINER.join(results) + @staticmethod + def normalize(tag): + """ TODO: docstring + """ + return tag.lower() + class DeliciousSerializer(Serializer): + """ TODO: docstring + """ SEPARATOR = JOINER = ' ' TAGS_WITH_SPACES = False class CommaSerializer(Serializer): + """ TODO: docstring + """ SEPARATOR = ',' JOINER = ', ' TAGS_WITH_SPACES = True class FlickrSerializer(Serializer): + """ TODO: docstring + """ + SEPARATOR = ' ' @classmethod def str2tags(cls, tagstr): + """ TODO: docstring + """ if not tagstr: return [] if '"' not in tagstr: - return [tag.strip() for tag in tagstr.split(' ') if tag.strip()] + return super(FlickrSerializer, cls).str2tags(tagstr) lstr = list(tagstr.strip()) - results, tok, prev, quoted = [], '', '', False + tags, keys, tok, prev, quoted = [], set(), '', '', False + + def addtok(tok): + """ TODO: docstring + """ + tok = tok.strip() + cleantok = cls.normalize(tok) + if cleantok and cleantok not in keys: + tags.append((cleantok, tok)) + keys.add(cleantok) + while lstr: char = lstr[0] if char == '"': @@ -56,7 +96,7 @@ (quoted and prev == '"' and '"' not in lstr)): if tok: quoted = False - results.append(tok.strip()) + addtok(tok) tok = '' else: tok += char @@ -64,25 +104,19 @@ del lstr[0] tok = tok.strip() if tok: - results.append(tok) - return results + addtok(tok) + return tags @classmethod def tags2str(cls, tags): + """ TODO: docstring + """ return ' '.join([ '"%s"' % tag if ' ' in tag else tag for tag in tags]) -def normalize(tags): - results = {} - for tag in tags: - cleantag = unicodedata.tag.strip().lower() - results[tag] = cleantag - return results - - - - class TagWithSpaceException(Exception): + """ TODO: docstring + """ pass diff -r b03f4b528ea19c5b2a174d45368e61f46eb14591 -r e7594eb1b0aafa3f9e52b3325c13f00b21888c6e tests.py --- a/tests.py Thu Apr 08 23:39:36 2010 -0500 +++ b/tests.py Fri Apr 09 00:51:09 2010 -0500 @@ -20,32 +20,45 @@ test('', []) test(' ', []) test(' ', []) - test('t1', ['t1']) - test(' t1', ['t1']) - test('t1 ', ['t1']) - test('t1 t2 t3', ['t1', 't2', 't3']) - test(' t1 t2 t3 ', ['t1', 't2', 't3']) - test('"t1"', ['t1']) - test(' "t1"', ['t1']) - test('"t1" ', ['t1']) - test(' "t1" ', ['t1']) - test('t1 "t2" t3', ['t1', 't2', 't3']) - test(' "t1" "t2" "t3" ', ['t1', 't2', 't3']) - test(' t"1 t"2 t"3 ', ['t1 t2', 't3']) - test(' ta"g1 "tag number 2" tag3 ', - ['tag1 tag', 'number', '2', 'tag3']) - test(' ta"g"1 "tag number 2" tag3 ', - ['tag1', 'tag number 2', 'tag3']) - test(' t"a"g"1 "tag number 2" tag3 ', - ['tag1 tag', 'number', '2', 'tag3']) + test('T1', [('t1', 'T1')]) + test(' T1', [('t1', 'T1')]) + test('T1 ', [('t1', 'T1')]) + test('T1 T2 T3', + [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(' T1 T2 T3 ', + [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test('"T1"', [('t1', 'T1')]) + test(' "T1"', [('t1', 'T1')]) + test('"T1" ', [('t1', 'T1')]) + test(' "T1" ', [('t1', 'T1')]) + test('T1 "T2" T3', [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(' "T1" "T2" "T3" ', + [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(' T"1 T"2 T"3 ', + [('t1 t2', 'T1 T2'), ('t3', 'T3')]) + test(' Ta"G1 "tAg nUmber 2" taG3 ', + [('tag1 tag', 'TaG1 tAg'), ('number', 'nUmber'), + ('2', '2'), ('tag3', 'taG3')]) + test(' Ta"G"1 "tAg numbEr 2" taG3 ', + [('tag1', 'TaG1'), ('tag number 2', + 'tAg numbEr 2'), ('tag3', 'taG3')]) + test(' t"A"G"1 "tAg nUMber 2" tAG3 ', + [('tag1 tag', 'tAG1 tAg'), ('number', 'nUMber'), + ('2', '2'), ('tag3', 'tAG3')]) test(' t"a"g"1 ""tag number 2" tag3 ', - ['tag1 tag number 2', 'tag3']) + [('tag1 tag number 2', 'tag1 tag number 2'), + ('tag3', 'tag3')]) test(' ta"g"1 "ta"g nu"mber 2" tag3 ', - ['tag1', 'tag', 'number 2', 'tag3']) + [('tag1', 'tag1'), ('tag', 'tag'), ('number 2', 'number 2'), + ('tag3', 'tag3')]) test(' ta"g"1 "ta"g nu"mber" 2" tag3 " ', - ['tag1', 'tag', 'number', '2 tag3']) + [('tag1', 'tag1'), ('tag', 'tag'), ('number', 'number'), + ('2 tag3', '2 tag3')]) test(' " t"a"""g"1 "ta"g ""nu"mber" 2"""" tag3 " "" ', - ['tag1', 'tag', 'number', '2', 'tag3']) + [('tag1', 'tag1'), ('tag', 'tag'), ('number', 'number'), + ('2', '2'), ('tag3', 'tag3')]) + test('TaG taG GAT tag gat', [('tag', 'TaG'), ('gat', 'GAT')]) + test('"TaG" taG GAT "tag" g"a"t', [('tag', 'TaG'), ('gat', 'GAT')]) def test_flickr_tags2str(self): def test(tags, expected): @@ -80,11 +93,13 @@ test('', []) test(' ', []) test(' ', []) - test('t1', ['t1']) - test(' t1', ['t1']) - test('t1 ', ['t1']) - test('t1 t2 t3', ['t1', 't2', 't3']) - test(' t1 t2 t3 ', ['t1', 't2', 't3']) + test('T1', [('t1', 'T1')]) + test(' T1', [('t1', 'T1')]) + test('T1 ', [('t1', 'T1')]) + test('T1 T2 T3', [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(' T1 T2 T3 ', + [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test('TaG taG GAT tag gat', [('tag', 'TaG'), ('gat', 'GAT')]) def test_delicious_tags2str(self): def test(tags, expected): @@ -105,15 +120,17 @@ test('', []) test(',', []) test(',,,,,,', []) - test('t1', ['t1']) - test(' t 1 ', ['t 1']) - test(',,,,,,,t1', ['t1']) - test(',,,,,,, t 1 ', ['t 1']) - test('t1,,,,,,,,,', ['t1']) - test('t1,t2,t3', ['t1', 't2', 't3']) - test(',,,t1,,,,t2,,,,t3,,,', ['t1', 't2', 't3']) - test(',,,t 1,,,,t 2,,,,t 3,,,', - ['t 1', 't 2', 't 3']) + test('T1', [('t1', 'T1')]) + test(' T 1 ', [('t 1', 'T 1')]) + test(',,,,,,,T1', [('t1', 'T1')]) + test(',,,,,,, T 1 ', [('t 1', 'T 1')]) + test('T1,,,,,,,,,', [('t1', 'T1')]) + test('T1,T2,T3', [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(',,,T1,,,,T2,,,,T3,,,', + [('t1', 'T1'), ('t2', 'T2'), ('t3', 'T3')]) + test(',,,T 1,,,,T 2,,,,T 3,,,', + [('t 1', 'T 1'), ('t 2', 'T 2'), ('t 3', 'T 3')]) + test('TaG,taG,GAT,tag,gat', [('tag', 'TaG'), ('gat', 'GAT')]) def test_comma_tags2str(self): def test(tags, expected):