Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

blogstats.py @ 1250

Last change on this file since 1250 was 1250, checked in by koval, 15 years ago
fixed moving of blog entry contents, added script for getting exported content stats
File size: 6.7 KB

Line
1	#! /usr/bin/env python
2	import sys
3	import os.path
4	import fnmatch
5	import re
6	from xml.dom import minidom
7	import time
8
def listFiles(root, patterns='*', recurse=1, return_folders=0):
    """Return the paths under ``root`` whose base name matches ``patterns``.

    Arguments:
    root -- directory where the search starts.
    patterns -- one or more ``fnmatch`` patterns separated by semicolons;
        a name is accepted as soon as it matches any of them.
    recurse -- when false, only the entries directly inside ``root`` are
        examined.
    return_folders -- when true, matching directories (and any other
        non-regular entries) are returned as well; otherwise only regular
        files are.

    Returns a list of normalized paths.  The order follows the directory
    traversal and is not sorted.  A ``root`` that cannot be listed yields
    an empty list.
    """
    pattern_list = patterns.split(';')
    results = []

    # os.walk is used instead of os.path.walk, which no longer exists in
    # Python 3; os.walk is available on Python 2 as well.
    for dirname, subdirs, filenames in os.walk(root):
        for name in subdirs + filenames:
            fullname = os.path.normpath(os.path.join(dirname, name))
            if not (return_folders or os.path.isfile(fullname)):
                continue
            if any(fnmatch.fnmatch(name, pattern) for pattern in pattern_list):
                results.append(fullname)
        if not recurse:
            # The first iteration covers ``root`` itself; stop before
            # descending into its subdirectories.
            break

    return results
35
def getPortalType(doc):
    """Return the stripped text of the first ``cmf:type`` element in ``doc``.

    ``doc`` is a parsed DOM document.  The value is converted with ``str``.
    Raises IndexError when the document has no ``cmf:type`` element and
    AttributeError when that element has no child node.
    """
    type_nodes = doc.getElementsByTagName('cmf:type')
    raw_value = type_nodes[0].firstChild.nodeValue
    return str(raw_value.strip())
40
def checkPortalType(doc, type_name):
    """Return True when the portal type read from ``doc`` equals ``type_name``.

    The type is obtained with getPortalType(), so the exceptions that
    function raises for a document without a usable ``cmf:type`` element
    propagate to the caller.
    """
    return getPortalType(doc) == type_name
46
47
# Patterns capturing the double-quoted value of href="..." and src="..."
# attributes in a piece of markup.
href = re.compile(r'href="([^"]+)"')
src = re.compile(r'src="([^"]+)"')

# URL prefixes under which an absolute link is still counted as pointing
# at the site itself (see isLocal).
SITE_URLS = ['http://somesite.com', 'http://www.somesite.com/']
55
def isLocal(url):
    """Tell whether ``url`` is counted as a link into the site itself.

    A URL starting with ``http://`` or ``file://`` is local only when it
    begins with one of the prefixes in SITE_URLS.  Any other URL is treated
    as relative and is therefore always local.
    """
    # NOTE(review): URLs with other schemes (e.g. https://) take the
    # "relative" branch and are reported as local -- confirm this is intended.
    if not url.startswith(('http://', 'file://')):
        return True
    return any(url.startswith(site) for site in SITE_URLS)
64
def getLinks(doc, field):
    """Return the local ``src`` URLs found in the ``field`` element of ``doc``.

    Arguments:
    doc -- parsed DOM document.
    field -- value of the ``name`` attribute selecting the ``<field>``
        element to inspect; only the first matching element is used.

    Returns a list (possibly empty) of the URLs captured by the ``src``
    pattern for which isLocal() is true.  A missing or empty field yields
    an empty list.
    """
    matching = [node for node in doc.getElementsByTagName('field')
                if node.getAttribute('name') == field]
    if not matching:
        return []

    # Gather every text and CDATA child rather than only the first one: the
    # DOM keeps CDATA sections as separate nodes, so the markup may be spread
    # over several children, and an empty field has no children at all.
    text = ''.join([child.nodeValue for child in matching[0].childNodes
                    if child.nodeType in (child.TEXT_NODE,
                                          child.CDATA_SECTION_NODE)])

    # Only src="..." attributes are collected; href links are not counted.
    # A real list is returned (not a lazy filter object) because callers
    # test it for emptiness and extend other lists with it.
    return [url for url in src.findall(text) if isLocal(url)]
76
def getLinkStats(dirname='.', prefix=None, verbose=False, sort_on_entry=False):
    """Print and return the local links used in exported blog entries.

    Every ``.marshall.xml`` file below ``dirname`` whose portal type is
    ``BlogEntry`` is scanned with getLinks() on its ``body`` field.

    Arguments:
    dirname -- directory to scan.
    prefix -- leading part removed from each file path before reporting.
    verbose -- when true, list the links in addition to the total count.
    sort_on_entry -- with ``verbose``, group the links by entry instead of
        listing absolute and relative links separately.

    Returns a dict mapping the entry directory (with ``/`` separators) to
    the list of links found in it; entries without links are omitted.
    """
    # Backslashes are translated only where they are the path separator.
    # The previous "sep == '/' and False or True" expression was always
    # true, so names were rewritten on every platform.
    replace_sep = os.path.sep != '/'

    entries = {}
    for path in listFiles(dirname, patterns='.marshall.xml'):
        doc = minidom.parse(path)
        if not checkPortalType(doc, 'BlogEntry'):
            continue

        links = getLinks(doc, 'body')
        if not links:
            continue

        if prefix and path.startswith(prefix):
            path = path[len(prefix):]
        entry = os.path.dirname(path)
        if replace_sep:
            entry = entry.replace('\\', '/')
        entries[entry] = links

    links = []
    for value in entries.values():
        links.extend(value)
    # A link is absolute when it carries a scheme; everything else is
    # relative.  Testing the marker directly avoids a quadratic list lookup.
    absolute = sorted(i for i in links if '://' in i)
    relative = sorted(i for i in links if '://' not in i)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('%d total links' % len(links))
    if verbose:
        if sort_on_entry:
            print('%d absolute links' % len(absolute))
            print('%d relative links' % len(relative))
            for key in sorted(entries):
                print(key)
                for link in entries[key]:
                    print('\t%s' % link)
        else:
            print('%d absolute links' % len(absolute))
            for link in absolute:
                print('\t%s' % link)
            print('%d relative links' % len(relative))
            for link in relative:
                print('\t%s' % link)

    return entries
129
def getLostContentStats(dirname='.', prefix=None, verbose=False):
    """Print the items recorded in ``.objects.xml`` files of blog entries.

    Blog entries are the ``.marshall.xml`` files below ``dirname`` whose
    portal type is ``BlogEntry``.  For each of them, every ``.objects.xml``
    file found from the entry's directory downwards is parsed and each
    ``<record>`` is counted under the value of its ``type`` attribute.

    Arguments:
    dirname -- directory to scan.
    prefix -- leading part removed from each entry path before reporting.
    verbose -- when true, list every item path below its type.

    Nothing is returned; the report is written to standard output.
    """
    # Backslashes are translated only where they are the path separator.
    # The previous "sep == '/' and False or True" expression was always
    # true, so names were rewritten on every platform.
    replace_sep = os.path.sep != '/'

    entries = []
    for path in listFiles(dirname, patterns='.marshall.xml'):
        if checkPortalType(minidom.parse(path), 'BlogEntry'):
            entries.append(path)

    content = {}
    for entry in entries:
        # Look up the manifests before the entry path is shortened for display.
        manifests = listFiles(os.path.dirname(entry), patterns='.objects.xml')

        if prefix and entry.startswith(prefix):
            entry = entry[len(prefix):]
        entry = os.path.dirname(entry)
        if replace_sep:
            entry = entry.replace('\\', '/')

        for manifest in manifests:
            doc = minidom.parse(manifest)
            for record in doc.getElementsByTagName('record'):
                item_path = '/'.join([entry, record.firstChild.nodeValue])
                content.setdefault(record.getAttribute('type'), []).append(item_path)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Lost content in blog entries under %s:" % dirname)
    count = 0
    for kind in sorted(content):
        paths = content[kind]
        count += len(paths)
        print(" %s: %s" % (kind, len(paths)))
        if verbose:
            for item in paths:
                print('\t%s' % item)
    print("%d total" % count)
173
def getContentStats(dirname='.', prefix=None, verbose=False):
    """Print how many exported items of each portal type exist.

    Every ``.marshall.xml`` file below ``dirname`` is parsed and its
    directory is counted under the portal type returned by getPortalType().

    Arguments:
    dirname -- directory to scan.
    prefix -- leading part removed from each file path before reporting.
    verbose -- when true, list every item path below its type.

    Nothing is returned; the report is written to standard output.
    """
    # Backslashes are translated only where they are the path separator.
    # The previous "sep == '/' and False or True" expression was always
    # true, so names were rewritten on every platform.
    replace_sep = os.path.sep != '/'

    content = {}
    for path in listFiles(dirname, patterns='.marshall.xml'):
        cmftype = getPortalType(minidom.parse(path))

        if prefix and path.startswith(prefix):
            path = path[len(prefix):]
        item = os.path.dirname(path)
        if replace_sep:
            item = item.replace('\\', '/')

        content.setdefault(cmftype, []).append(item)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Content under %s:" % dirname)
    count = 0
    for kind in sorted(content):
        paths = content[kind]
        count += len(paths)
        print(" %s: %s" % (kind, len(paths)))
        if verbose:
            for item in paths:
                print('\t%s' % item)
    print("%d total" % count)
207
# Maps the action name given on the command line to the report function.
ACTIONS = {
    'all': getContentStats,
    'lost': getLostContentStats,
    'links': getLinkStats,
}

# Usage: blogstats.py ACTION[-MODIFIERS] [PATH [PREFIX]]
#   MODIFIERS may contain 'v' (verbose) and 's' (sort links by entry).
#   PATH defaults to '.'; PREFIX defaults to PATH unless PATH is '.'.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Split "<action>-<modifiers>" once; without a dash the modifier
        # string is empty, which leaves both flags off.
        action, _, mod = sys.argv[1].partition('-')
    else:
        action, mod = None, ''

    if action not in ACTIONS:
        print("Need one of next action as argument: %s" % list(ACTIONS.keys()))
        sys.exit(1)

    verbose = 'v' in mod
    sort_on_entry = 's' in mod

    if len(sys.argv) > 2:
        path = sys.argv[2]
    else:
        path = '.'

    if len(sys.argv) > 3:
        prefix = sys.argv[3]
    elif path != '.':
        prefix = path
    else:
        prefix = None

    start = time.time()

    # Only the link report accepts the extra sort_on_entry flag.
    if action == 'links':
        ACTIONS[action](path, prefix, verbose, sort_on_entry)
    else:
        ACTIONS[action](path, prefix, verbose)

    end = time.time()
    print("%d second elapsed" % (end - start, ))

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: products/quintagroup.transmogrifier.simpleblog2quills/branches/without_image_move/quintagroup/transmogrifier/simpleblog2quills/blogstats.py @ 1250

Download in other formats: