1 | #! /usr/bin/env python |
---|
2 | import sys |
---|
3 | import os.path |
---|
4 | import fnmatch |
---|
5 | import re |
---|
6 | from xml.dom import minidom |
---|
7 | import time |
---|
8 | |
---|
def listFiles(root, patterns='*', recurse=1, return_folders=0):
    """Return the paths under *root* whose base name matches *patterns*.

    Args:
        root: Directory at which the scan starts.
        patterns: Semicolon-separated ``fnmatch`` patterns (for example
            ``'*.xml;*.txt'``).  A name is kept when it matches any of them.
        recurse: When false, only the direct children of *root* are examined.
        return_folders: When true, matching directories are reported in
            addition to matching files.

    Returns:
        A list of ``os.path.normpath``-normalized paths.  Directories are
        visited top-down and the names inside each directory are handled in
        sorted order, so the result is deterministic.  A *root* that cannot
        be listed yields an empty list.
    """
    # Expand patterns from semicolon-separated string to list
    pattern_list = patterns.split(';')
    results = []

    # os.walk is used because os.path.walk no longer exists in Python 3.
    for dirname, subdirs, filenames in os.walk(root):
        # Sorting in place also fixes the order in which os.walk descends.
        subdirs.sort()
        for name in sorted(subdirs + filenames):
            fullname = os.path.normpath(os.path.join(dirname, name))
            if not (return_folders or os.path.isfile(fullname)):
                continue
            if any(fnmatch.fnmatch(name, pattern) for pattern in pattern_list):
                results.append(fullname)
        if not recurse:
            # Emptying the list in place stops os.walk from descending.
            del subdirs[:]

    return results
---|
35 | |
---|
def getPortalType(doc):
    """Return the portal type recorded in a parsed marshall document.

    The value is the text of the first ``cmf:type`` element of *doc*, with
    surrounding whitespace removed and converted to ``str``.  An
    ``IndexError`` is raised when the document has no such element.
    """
    type_nodes = doc.getElementsByTagName('cmf:type')
    raw_value = type_nodes[0].firstChild.nodeValue
    return str(raw_value.strip())
---|
40 | |
---|
def checkPortalType(doc, type_name):
    """Return True when the portal type of *doc* equals *type_name*."""
    return getPortalType(doc) == type_name
---|
46 | |
---|
47 | |
---|
# Patterns capturing the value of a double-quoted href / src attribute.
href = re.compile(r'href="([^"]+)"')
src = re.compile(r'src="([^"]+)"')

# URL prefixes that identify this site; isLocal() compares absolute links
# against them.
SITE_URLS = ['http://somesite.com', 'http://www.somesite.com/']
---|
55 | |
---|
def isLocal(url, site_urls=None):
    """Tell whether *url* points into this site.

    Args:
        url: Link target as found in the markup.
        site_urls: URL prefixes that identify the site.  Defaults to the
            module-level ``SITE_URLS``.

    Returns:
        ``True`` when *url* does not start with ``http://`` or ``file://``
        (it is then taken to be a relative link), or when it starts with one
        of *site_urls*; ``False`` otherwise.
    """
    if site_urls is None:
        site_urls = SITE_URLS

    # NOTE: only these two schemes are recognized as absolute.  Any other
    # URL (https://, ftp://, mailto:, ...) is reported as local.
    if not url.startswith(('http://', 'file://')):
        return True

    # startswith() with an empty tuple is False, so an empty site list
    # makes every http:// or file:// URL non-local.
    return url.startswith(tuple(site_urls))
---|
64 | |
---|
def getLinks(doc, field):
    """Return the local ``src`` URLs found in one field of *doc*.

    Args:
        doc: Parsed minidom document.
        field: Value of the ``name`` attribute of the ``field`` element to
            inspect; the first matching element is used.

    Returns:
        A list of the ``src="..."`` attribute values for which ``isLocal``
        is true, in document order.  The list is empty when there is no
        such field or the field has no text.  Only ``src`` attributes are
        scanned; ``href`` links are not reported.
    """
    for elem in doc.getElementsByTagName('field'):
        if elem.getAttribute('name') == field:
            break
    else:
        return []

    # An empty field (<field name="..."/>) has no child node to read.
    first = elem.firstChild
    if first is None or not first.nodeValue:
        return []

    # Build a real list: filter() is a lazy, always-truthy iterator on
    # Python 3, which breaks emptiness tests and len() in callers.
    return [url for url in src.findall(first.nodeValue) if isLocal(url)]
---|
76 | |
---|
def getLinkStats(dirname='.', prefix=None, verbose=False, sort_on_entry=False):
    """Collect and print the local links found in BlogEntry bodies.

    Every ``.marshall.xml`` file under *dirname* is parsed; documents whose
    portal type is not ``BlogEntry`` are skipped.  A summary is printed to
    standard output.

    Args:
        dirname: Directory tree to scan.
        prefix: Leading text stripped from each file path before it is used
            as a key; ignored when empty or ``None``.
        verbose: Print the individual links as well as the total.
        sort_on_entry: With *verbose*, list the links grouped per entry
            instead of as one absolute and one relative list.

    Returns:
        A dict mapping each entry directory (prefix stripped) to the list of
        links found in its ``body`` field.  Entries without links are left
        out.
    """
    # Separators only need rewriting where the platform does not use '/'.
    # The previous ``sep == '/' and False or True`` was always True.
    replace_sep = os.path.sep != '/'

    entries = {}
    for f in listFiles(dirname, patterns='.marshall.xml'):
        doc = minidom.parse(f)
        if not checkPortalType(doc, 'BlogEntry'):
            continue

        # Materialize the result so emptiness and len() are reliable even
        # if a lazy iterator is handed back.
        links = list(getLinks(doc, 'body'))
        if not links:
            continue

        if prefix and f.startswith(prefix):
            f = f[len(prefix):]
        f = os.path.dirname(f)
        if replace_sep:
            f = f.replace('\\', '/')
        entries[f] = links

    links = []
    for value in entries.values():
        links.extend(value)
    # A link is absolute exactly when it carries a scheme separator.
    absolute = sorted(i for i in links if '://' in i)
    relative = sorted(i for i in links if '://' not in i)

    print('%d total links' % len(links))
    if verbose:
        if sort_on_entry:
            print('%d absolute links' % len(absolute))
            print('%d relative links' % len(relative))
            for k in sorted(entries):
                print(k)
                for i in entries[k]:
                    print('\t%s' % i)
        else:
            print('%d absolute links' % len(absolute))
            for link in absolute:
                print('\t%s' % link)
            print('%d relative links' % len(relative))
            for link in relative:
                print('\t%s' % link)

    return entries
---|
129 | |
---|
def getLostContentStats(dirname='.', prefix=None, verbose=False):
    """Print the records listed in ``.objects.xml`` files of blog entries.

    Every ``.marshall.xml`` file under *dirname* whose portal type is
    ``BlogEntry`` marks an entry directory.  Each ``.objects.xml`` file found
    under such a directory is parsed, and its ``record`` elements are counted
    per ``type`` attribute.  The report goes to standard output; nothing is
    returned.

    Args:
        dirname: Directory tree to scan.
        prefix: Leading text stripped from each entry path before it is
            printed; ignored when empty or ``None``.
        verbose: Print the path of every record, not only the counts.
    """
    # Separators only need rewriting where the platform does not use '/'.
    # The previous ``sep == '/' and False or True`` was always True.
    replace_sep = os.path.sep != '/'

    entries = []
    for f in listFiles(dirname, patterns='.marshall.xml'):
        doc = minidom.parse(f)
        if checkPortalType(doc, 'BlogEntry'):
            entries.append(f)

    content = {}
    for e in entries:
        # Look up the object listings before the path is shortened below.
        files = listFiles(os.path.dirname(e), patterns='.objects.xml')

        if prefix and e.startswith(prefix):
            e = e[len(prefix):]
        e = os.path.dirname(e)
        if replace_sep:
            e = e.replace('\\', '/')

        for f in files:
            doc = minidom.parse(f)
            for r in doc.getElementsByTagName('record'):
                item_path = '/'.join([e, r.firstChild.nodeValue])
                content.setdefault(r.getAttribute('type'), []).append(item_path)

    print("Lost content in blog entries under %s:" % dirname)
    count = 0
    for k in sorted(content):
        n = len(content[k])
        count += n
        print(" %s: %s" % (k, n))
        if verbose:
            for p in content[k]:
                print('\t%s' % p)
    print("%d total" % count)
---|
173 | |
---|
def getContentStats(dirname='.', prefix=None, verbose=False):
    """Print how many objects of each portal type exist under *dirname*.

    Every ``.marshall.xml`` file under *dirname* is parsed and grouped by the
    portal type it declares.  The report goes to standard output; nothing is
    returned.

    Args:
        dirname: Directory tree to scan.
        prefix: Leading text stripped from each path before it is printed;
            ignored when empty or ``None``.
        verbose: Print the directory of every object, not only the counts.
    """
    # Separators only need rewriting where the platform does not use '/'.
    # The previous ``sep == '/' and False or True`` was always True.
    replace_sep = os.path.sep != '/'

    content = {}
    for f in listFiles(dirname, patterns='.marshall.xml'):
        doc = minidom.parse(f)
        cmftype = getPortalType(doc)

        if prefix and f.startswith(prefix):
            f = f[len(prefix):]
        f = os.path.dirname(f)
        if replace_sep:
            f = f.replace('\\', '/')

        content.setdefault(cmftype, []).append(f)

    print("Content under %s:" % dirname)
    count = 0
    for k in sorted(content):
        n = len(content[k])
        count += n
        print(" %s: %s" % (k, n))
        if verbose:
            for p in content[k]:
                print('\t%s' % p)
    print("%d total" % count)
---|
207 | |
---|
# Maps the command-line action name to the function implementing it.
ACTIONS = {
    'all': getContentStats,
    'lost': getLostContentStats,
    'links': getLinkStats,
}

if __name__ == "__main__":
    # Usage: <script> ACTION[-MODIFIERS] [PATH [PREFIX]]
    # MODIFIERS is a string of letters: 'v' = verbose, 's' = sort on entry.
    if len(sys.argv) > 1 and sys.argv[1].split('-', 1)[0] in ACTIONS:
        # partition() leaves mod empty when no '-MODIFIERS' part was given.
        action, _, mod = sys.argv[1].partition('-')
        verbose = 'v' in mod
        sort_on_entry = 's' in mod
    else:
        print("Need one of next action as argument: %s" % sorted(ACTIONS))
        sys.exit(1)

    if len(sys.argv) > 2:
        path = sys.argv[2]
    else:
        path = '.'

    # Without an explicit prefix, strip the scanned path itself from the
    # reported paths (unless it is just the current directory).
    if len(sys.argv) > 3:
        prefix = sys.argv[3]
    elif path != '.':
        prefix = path
    else:
        prefix = None

    start = time.time()

    # Only the 'links' action accepts the sort_on_entry flag.
    if action == 'links':
        ACTIONS[action](path, prefix, verbose, sort_on_entry)
    else:
        ACTIONS[action](path, prefix, verbose)

    end = time.time()
    print("%d second elapsed" % (end - start,))
---|