Commit e3f95e9e authored by Gregor Zimmermann's avatar Gregor Zimmermann

final version with repository comparison (a better hashing method still needs to be found)

parent 3b358e9a
......@@ -24,7 +24,7 @@
<property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RunManager" selected="Python.multithreading">
<component name="RunManager" selected="Python.compare">
<configuration name="Tokenizer" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="untitled2" />
<option name="INTERPRETER_OPTIONS" value="" />
......@@ -46,7 +46,7 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="g4_processer" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<configuration name="compare" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="untitled2" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
......@@ -58,7 +58,7 @@
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/g4_processer.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/compare.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
......@@ -67,7 +67,7 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="history" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<configuration name="dl_g4" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="untitled2" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
......@@ -75,11 +75,11 @@
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="WORKING_DIRECTORY" value="$USER_HOME$/Documents/Studium/MSR/msr_aeg/dl_script" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/history.py" />
<option name="SCRIPT_NAME" value="$USER_HOME$/Documents/Studium/MSR/msr_aeg/dl_script/dl_g4.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
......@@ -88,7 +88,7 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="loader" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<configuration name="history" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="untitled2" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
......@@ -100,7 +100,7 @@
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/loader.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/history.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
......@@ -132,11 +132,11 @@
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.compare" />
<item itemvalue="Python.multithreading" />
<item itemvalue="Python.dl_g4" />
<item itemvalue="Python.history" />
<item itemvalue="Python.Tokenizer" />
<item itemvalue="Python.loader" />
<item itemvalue="Python.g4_processer" />
</list>
</recent_temporary>
</component>
......@@ -166,25 +166,29 @@
<servers />
</component>
<component name="WindowStateProjectService">
<state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1585261727186">
<state x="690" y="268" key="#com.intellij.refactoring.safeDelete.UnsafeUsagesDialog" timestamp="1585303144976">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state x="690" y="268" key="#com.intellij.refactoring.safeDelete.UnsafeUsagesDialog/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585303144976" />
<state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1585315347741">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585261727186" />
<state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585315347741" />
<state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1585245574243" />
<state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1585261727186">
<state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1585315347741">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585261727186" />
<state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585315347741" />
<state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1585245574243" />
<state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1585261727186">
<state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1585315347741">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585261727186" />
<state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585315347741" />
<state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1585245574243" />
<state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1585261727186">
<state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1585315347741">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585261727186" />
<state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1585315347741" />
<state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1585245574243" />
<state width="1899" height="348" key="GridCell.Tab.1.bottom" timestamp="1585146236167">
<screen x="0" y="0" width="1920" height="1040" />
......
import pandas as pd
# takes a grammar, returns a vector describing its rules
# testcases
#=======================================================================================================================
def hasher(frame):
    occurrence = frame['label'].value_counts()
    counts = [len(frame)]  # total number of elements
    hashes = [0]           # the total count itself is not hashed
    for label in ('annotation', 'tokenVocab', 'rule', 'fragment', 'mode'):
        if label in occurrence:
            counts.append(occurrence[label])
            # concatenate the names and hash the result
            # TODO: make the add-and-hash step order-insensitive
            hashes.append(hash(frame.loc[frame.label == label]['name'].sum()))
        else:
            counts.append(0)
            hashes.append(0)
    vec = pd.DataFrame({'count': counts, 'hashed': hashes},
                       index=['elem', 'anno', 'vocab', 'rules', 'fragments', 'mode'])
    # normalize each column so it sums to 1
    vec['count'] /= vec['count'].sum()
    vec['hashed'] /= vec['hashed'].sum()
    return vec
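# -----------------------------------------------------------------------------
# A minimal sketch of the "better hashing method" flagged in the commit message
# (illustration only, not part of this commit): hash(...sum()) above
# concatenates the names first, so the result is order-sensitive, and Python's
# built-in hash() of strings is salted per process (PYTHONHASHSEED), so values
# from different runs are not comparable. Summing stable per-name digests
# fixes both.
import hashlib

def order_insensitive_hash(names):
    # the sum of per-name md5 digests is commutative, so row order no longer
    # matters; mod 2**64 keeps the value machine-word sized
    return sum(int(hashlib.md5(str(n).encode('utf-8')).hexdigest(), 16)
               for n in names) % 2**64

# hypothetical use inside hasher:
#   h_rule = order_insensitive_hash(frame.loc[frame.label == 'rule']['name'])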
#=======================================================================================================================
# TODO: pass the repo in as well
def tokenizer(file):
    # Step 1: strip all comments
    with open(file, 'r', encoding='utf-8') as fh:
        f = comment_cleaner(fh.read())
    rows = []
    for line, next_line in zip(f, f[1:]):
        x = line.strip().split()
        y = next_line.strip().split()
        # 1. check for the grammar type: lexer/parser/grammar, ends with ';'
        if 'grammar' in line:
            rows.append({'label': x[0], 'name': x[-1]})
        if 'tokenVocab' in line:
            rows.append({'label': 'tokenVocab', 'name': line})
            # TODO: block building!?
            # options follow as tokenVocab=<name>
        # 2. check for annotations: @text {*? ?*}
        if x[0].startswith('@'):
            rows.append({'label': 'annotation', 'name': x[0][1:]})
        # 3. check for rules -> they end with ';'
        # how to detect rules properly:
        # length of the rule, ordering by name, or similar
        if ':' in line or y[0] == ':':
            # 3.2 keywords like fragment/mode (see above) get their own labels
            if x[0] == 'mode' or x[0] == 'fragment':
                rows.append({'label': x[0], 'name': x[1]})
            elif y[0] == ':':
                rows.append({'label': 'rule', 'name': x[0]})
            elif x[0] != ':':
                rows.append({'label': 'rule', 'name': x[0]})
    # TODO: clean ':'/';' out of the names; hashing + normalizing
    vector = pd.DataFrame(rows, columns=['label', 'name'])
    return hasher(vector)
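# The frame returned above is 6 x 2: rows ['elem', 'anno', 'vocab', 'rules',
# 'fragments', 'mode'], columns 'count' and 'hashed', each normalized to sum
# to 1. Illustrative call (hypothetical file name):
#   vec = tokenizer('Hello.g4')
#   vec['count']['rules']   # share of rule entries among all counted elements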
#=======================================================================================================================
def comment_cleaner(f):
    # delimiters for block and single-line comments
    delim_open = '/*'
    delim_close = '*/'
    delim_sl = '//'
    # note: repeated slicing copies the whole string -> memory-hungry on big files
    while delim_open in f and delim_close in f:
        op = f.index(delim_open)
        cl = f.index(delim_close) + 2
        if cl <= op:
            # stray '*/' before any '/*': drop the stray closer itself
            f = f[:cl - 2] + f[cl:]
        else:
            # cut the block comment including both delimiters
            f = f[:op] + f[cl:]
    while delim_sl in f:
        start = f.index(delim_sl)
        if (f[start - 1] == "'" and f[start + 2] == "'") or \
           (f[start - 1] == '"' and f[start + 2] == '"'):
            # a quoted '//' is grammar content, not a comment:
            # mask it so the loop terminates
            f = f[:start] + '--' + f[start + 2:]
        else:
            try:
                end = f[start:].index('\n') + start  # next newline
                f = f[:start] + f[end:]
            except ValueError:
                # comment runs to the end of the file
                f = f[:start]
    # split into lines, strip, and drop empty lines
    f = [line.strip() for line in f.splitlines()]
    return [line for line in f if line]
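# -----------------------------------------------------------------------------
# Quick sanity check for comment_cleaner (illustration only, not part of this
# commit): block and line comments disappear, blank lines are dropped, and a
# list of stripped lines comes back.
if __name__ == '__main__':
    sample = "grammar Hello;  // toy example\n/* block\n   comment */\nr : 'hello' ID ;\n"
    print(comment_cleaner(sample))
    # expected: ['grammar Hello;', "r : 'hello' ID ;"]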
import pandas as pd
import threading
import time
file_usage = threading.Semaphore(value=1)
vec_grammar = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Vectorized_Grammars.csv'
par_vec_grammar = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Antlr_Parr.csv'
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\compare.csv'
timed_vec = pd.read_csv(vec_grammar)
parr_vec = pd.read_csv(par_vec_grammar)
# metrics: close/identical by normalized rule hash; close/identical by rule count
def compare(df_1dim):
    closest_dir = parr_vec.loc[1]
    for index, line in parr_vec.iterrows():
        if abs(df_1dim['RuleHash'] - line['RuleHash']) <= abs(df_1dim['RuleHash'] - closest_dir['RuleHash']):
            closest_dir = line
    df = pd.DataFrame({'Name': [df_1dim['Name']],
                       'Type': [df_1dim['Type']],
                       'TokenVocab': [df_1dim['TokenVocab']],
                       'RuleCount': [df_1dim['RuleCount']],
                       'RuleHash': [df_1dim['RuleHash']],
                       'Repo': [df_1dim['Repo']],
                       'FileName': [df_1dim['FileName']],
                       'Date': [df_1dim['Date']],
                       'ParrGramName': [closest_dir['Name']],
                       'Diff': [abs(closest_dir['RuleHash'] - df_1dim['RuleHash'])],
                       'ParrGramHash': [closest_dir['RuleHash']],
                       'ParrGramRuleCount': [closest_dir['RuleCount']]})
    with file_usage:  # serialize appends to the shared csv
        df.to_csv(path, mode='a', header=False)
def loaddistributor():
    df = pd.DataFrame({'Name': [], 'Type': [], 'TokenVocab': [], 'RuleCount': [],
                       'RuleHash': [], 'Repo': [], 'FileName': [], 'Date': [],
                       'ParrGramName': [], 'Diff': [], 'ParrGramHash': [],
                       'ParrGramRuleCount': []})
    df.to_csv(path, mode='w')
    x = 0
    for index, line in timed_vec.iterrows():
        try:
            working_thread = threading.Thread(target=compare, args=[line])
            working_thread.start()
            print(x)
            x += 1
            time.sleep(0.2)  # crude throttle so threads don't pile up
        except RuntimeError:
            print("Thread couldn't be started")
loaddistributor()
\ No newline at end of file
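# -----------------------------------------------------------------------------
# Design note (sketch, not part of this commit): compare() is CPU-bound pandas
# work, so under the GIL one thread per row mostly adds overhead. A bounded
# pool gives the same serialized csv appends (via file_usage inside compare)
# without the sleep()-based throttling:
from concurrent.futures import ThreadPoolExecutor

def loaddistributor_pooled(max_workers=8):
    # hypothetical variant of loaddistributor() above
    df = pd.DataFrame(columns=['Name', 'Type', 'TokenVocab', 'RuleCount',
                               'RuleHash', 'Repo', 'FileName', 'Date',
                               'ParrGramName', 'Diff', 'ParrGramHash',
                               'ParrGramRuleCount'])
    df.to_csv(path, mode='w')
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for index, line in timed_vec.iterrows():
            pool.submit(compare, line)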
# walks the dir with .g4 files and turns them into a csv of vectors representing the grammars
import os
import pandas as pd
import Tokenizer
df_ref = pd.DataFrame([])
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\data_scource\\antlr_gram' # Terry Parr Grammar
out_path ='C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\tokenizedTerPar.csv'
count = 0
output_df = pd.DataFrame({
'Repo-name':[],
'file-name':[],
'type':[], #hashed or counted
'elem':[],
'anno':[],
'vocab':[],
'rules':[],
'fragments':[],
'mode':[]
})
output_df.to_csv(out_path)
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith('.g4'):
#print('processing file :' + file)
scribble=Tokenizer.tokenizer(os.path.join(root, file))
output_df = pd.DataFrame({
'Repo-name':['TerrParr','TerrParr'],
'file-name':[file, file],
'type':['count', 'hashed'], #hashed or counted
'elem':[scribble['count'].elem, scribble['hashed'].elem],
'anno':[scribble['count'].anno, scribble['hashed'].anno],
'vocab':[scribble['count'].vocab, scribble['hashed'].vocab],
'rules':[scribble['count'].rules, scribble['hashed'].rules],
'fragments':[scribble['count'].fragments, scribble['hashed'].fragments],
'mode':[scribble['count']['mode'], scribble['hashed']['mode']]
})
output_df.to_csv(out_path, mode='a', header=False)
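# -----------------------------------------------------------------------------
# Design note (hypothetical helper, not part of this commit): the loop above
# appends a two-row frame per .g4 file; collecting all rows first and writing
# once avoids repeated csv opens and keeps a single header.
def tokenize_dir_once(src_dir, dest_csv):
    rows = []
    for root, dirs, files in os.walk(src_dir):
        for file in files:
            if file.endswith('.g4'):
                scribble = Tokenizer.tokenizer(os.path.join(root, file))
                for kind in ('count', 'hashed'):
                    rows.append({'Repo-name': 'TerrParr', 'file-name': file,
                                 'type': kind, **scribble[kind].to_dict()})
    pd.DataFrame(rows).to_csv(dest_csv)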
......@@ -6,7 +6,7 @@ import os
file = 'sample.csv'
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Versioning\\'
raw_data = pd.read_csv(file)[190:]
raw_data = pd.read_csv(file)
for repo in raw_data['repository']:
print('repo '+ repo +' is getting checked')
......