-
Notifications
You must be signed in to change notification settings - Fork 0
/
ErrorExtractor.py
124 lines (110 loc) · 4.28 KB
/
ErrorExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding=UTF-8
import difflib
from collections import deque
import Utils
# Module-wide accumulator of extracted errors. processStacks appends
# (comment, oldSentence, newSentence) tuples to it; resolveEvolution later
# rebinds it to a list of [comment, sentence] version chains.
errorBuffer = []
# Settings mapping read by this module (keys "sentenceTreshold" and
# "allowNesting"). Starts as None - presumably assigned by the caller before
# extract() is used; TODO confirm.
context = None
def appendRev(textList, oldRev, newRev):
    """Record that errorBuffer entry ``oldRev`` was followed by entry ``newRev``.

    ``textList`` is a list of chains, each chain being a tuple of indices
    into errorBuffer. The first chain whose last index equals ``oldRev`` is
    replaced by a copy extended with ``newRev``. When no chain ends with
    ``oldRev``, a new ``(oldRev, newRev)`` chain is appended instead.

    Nothing is recorded when the texts involved are longer than 100
    characters (see the inline comments for which fields are checked).
    """
    maxLength = 100

    # Field 1 of the newer entry is too long - don't add it as an error.
    if len(errorBuffer[newRev][1]) > maxLength:
        return

    for position, chain in enumerate(textList):
        if chain[-1] == oldRev:
            textList[position] = chain + (newRev,)
            return

    # No chain to continue: start a new one, but only when fields 0 and 1 of
    # the older entry are both short enough.
    olderEntry = errorBuffer[oldRev]
    if len(olderEntry[0]) <= maxLength and len(olderEntry[1]) <= maxLength:
        textList.append((oldRev, newRev))
def removeNestedErrors():
    """Collapse multi-step chains in errorBuffer to their first and last version.

    Entries are visited in order and each one is replaced by a two-element
    list ``[firstVersion, lastVersion]``. The walk stops at the first entry
    that already has exactly two versions; later entries are left untouched.
    """
    global errorBuffer
    for index, versions in enumerate(errorBuffer):
        if len(versions) == 2:
            # resolveEvolution puts the chains (longest first) ahead of the
            # plain two-version entries, so nothing after this point is nested.
            break
        errorBuffer[index] = [versions[0], versions[-1]]
def resolveEvolution():
    """Group successive edits of the same sentence into evolution chains.

    On entry errorBuffer holds ``(comment, oldSentence, newSentence)``
    tuples. On exit it is rebound to a list of evolution lists:

        [[[allComments, baseSentence], [comment, edit], ...], ...]

    Chains of linked edits come first, longest chain first, followed by the
    remaining entries converted to ``[[comment, old], [comment, new]]``.

    Note: every entry that takes part in a match is removed from the
    remaining entries, even when appendRev declined to record the link.
    """
    global errorBuffer

    chains = []
    linked = set()
    total = len(errorBuffer)

    # Link entry `older` to the first entry at or after it whose old sentence
    # equals the new sentence produced by `older`.
    for older in range(total):
        editedSentence = errorBuffer[older][2]
        for newer in range(older, total):
            if editedSentence == errorBuffer[newer][1]:
                linked.update((errorBuffer[older], errorBuffer[newer]))
                appendRev(chains, older, newer)
                break

    evolutions = []
    for chain in chains:
        # The head of a chain carries every comment of the chain (each
        # followed by "<separator>") together with the oldest sentence.
        mergedComments = "".join(
            errorBuffer[index][0] + "<separator>" for index in chain
        )
        history = [[mergedComments, errorBuffer[chain[0]][1]]]
        # Then one [comment, sentence] step per edit, oldest to newest.
        history.extend(
            [errorBuffer[index][0], errorBuffer[index][2]] for index in chain
        )
        evolutions.append(history)
    evolutions.sort(key=len, reverse=True)

    # Normalize the unlinked entries to the same structure as the chains.
    unlinked = [
        [[entry[0], entry[1]], [entry[0], entry[2]]]
        for entry in errorBuffer
        if entry not in linked
    ]
    errorBuffer = evolutions + unlinked
def processStacks(oldStack, newStack, comment):
    """Match sentences from the old stack to their best counterpart in the new stack.

    Sentences of ``oldStack`` that also occur in ``newStack`` are ignored.
    Every remaining old sentence is scored against each new sentence with
    ``Utils.sentenceSimilarity``. If any score exceeds
    ``context["sentenceTreshold"]``, the highest-scoring new sentence (the
    earliest one on ties) is recorded in errorBuffer as
    ``(comment, oldSentence, newSentence)``.

    Does nothing when either stack is empty.
    """
    if not oldStack or not newStack:
        return

    changedSentences = [sent for sent in oldStack if sent not in newStack]
    for oldSent in changedSentences:
        scored = [
            (candidate, Utils.sentenceSimilarity(oldSent, candidate))
            for candidate in newStack
        ]
        accepted = [
            pair for pair in scored if pair[1] > context["sentenceTreshold"]
        ]
        if accepted:
            bestSentence = max(accepted, key=lambda pair: pair[1])[0]
            errorBuffer.append((comment, oldSent, bestSentence))
def processRevisions(oldRev, newRev):
    """Collect the elements that differ between two revisions and match them.

    The sequences stored under the ``"*"`` key of both revisions are
    compared. Elements present only in the old revision go to the old stack,
    elements present only in the new revision go to the new stack, each in
    sequence order. Both stacks are then handed to processStacks together
    with ``newRev["comment"]``.

    Returns without doing anything when either revision, or either
    revision's ``"*"`` value, is None.
    """
    if (oldRev is None or newRev is None
            or oldRev["*"] is None or newRev["*"] is None):
        return

    oldText = oldRev["*"]
    newText = newRev["*"]
    oldStack = deque()
    newStack = deque()

    # Read the opcodes directly instead of parsing difflib.unified_diff text.
    # In the textual diff a removed element starting with "--" (or an added
    # one starting with "++") looks exactly like the "---"/"+++" file header
    # and would be dropped. unified_diff is built on this same matcher, so
    # the changed elements and their order are unchanged.
    matcher = difflib.SequenceMatcher(None, oldText, newText)
    for tag, oldStart, oldEnd, newStart, newEnd in matcher.get_opcodes():
        if tag in ("replace", "delete"):
            oldStack.extend(oldText[oldStart:oldEnd])
        if tag in ("replace", "insert"):
            newStack.extend(newText[newStart:newEnd])

    processStacks(oldStack, newStack, newRev["comment"])
def extract(page):
    """Extract errors from a page's revision history into ``page["errors"]``.

    Resets the module-level errorBuffer, then walks ``page["revisions"]`` in
    order. Each revision is compared with the one before it (the earlier
    list item is treated as the old revision) unless its ``"comment"`` is
    None. The collected entries are grouped by resolveEvolution, and nested
    chains are flattened by removeNestedErrors when
    ``context["allowNesting"]`` is falsy.

    Returns the same ``page`` object with ``"errors"`` set to the resulting
    list (an empty list when there are no revisions or nothing was found).
    """
    global errorBuffer
    errorBuffer = []

    revisions = page["revisions"]
    if len(revisions) == 0:
        page["errors"] = []
        return page

    for position in range(1, len(revisions)):
        previous = revisions[position - 1]
        current = revisions[position]
        if current["comment"] is not None:
            processRevisions(previous, current)

    resolveEvolution()
    if not context["allowNesting"]:
        removeNestedErrors()

    page["errors"] = errorBuffer if len(errorBuffer) > 0 else []
    return page