Commit 212d94c3 authored by Gregor Zimmermann

Prerelease Scripts

parent b17ac59b
import pandas as pd
# Takes a grammar and returns a vector describing its rules.
# Test cases: see the smoke-test sketch after hasher() below.
#=======================================================================================================================
def hasher(frame):
    occurence = frame['label'].value_counts()
    if 'annotation' in occurence:
        c_anno = occurence['annotation']
        h_anno = hash(frame.loc[frame.label == 'annotation'].name.sum())
    else:
        c_anno, h_anno = 0, 0  # no annotations found
    if 'tokenVocab' in occurence:
        c_vocab = occurence['tokenVocab']
        h_vocab = hash(frame.loc[frame.label == 'tokenVocab'].name.sum())
    else:
        c_vocab, h_vocab = 0, 0
    if 'rule' in occurence:
        c_rule = occurence['rule']
        h_rule = hash(frame.loc[frame.label == 'rule'].name.sum())  # ToDo: make the adding and hashing order-insensitive
    else:
        c_rule, h_rule = 0, 0
    if 'fragment' in occurence:
        c_frag = occurence['fragment']
        h_frag = hash(frame.loc[frame.label == 'fragment'].name.sum())
    else:
        c_frag, h_frag = 0, 0
    if 'mode' in occurence:
        c_mode = occurence['mode']
        h_mode = hash(frame.loc[frame.label == 'mode'].name.sum())
    else:
        c_mode, h_mode = 0, 0
    vec = pd.DataFrame({'count': [len(frame),  # number of elements
                                  c_anno,      # count of annotations
                                  c_vocab,     # count of tokenVocab entries
                                  c_rule,      # count of rules
                                  c_frag,      # count of fragments
                                  c_mode],     # count of modes
                        'hashed': [0,        # number of elements is not hashed
                                   h_anno,   # hashed annotations
                                   h_vocab,  # hashed tokenVocab entries
                                   h_rule,   # hashed rules
                                   h_frag,   # hashed fragments
                                   h_mode]}, # hashed modes
                       index=['elem', 'anno', 'vocab', 'rules', 'fragments', 'mode'])
    fir_col = vec['count'].sum()
    sec_col = vec['hashed'].sum()
    vec['count'] /= fir_col
    vec['hashed'] /= sec_col
    return vec
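# Smoke-test sketch (not part of the original commit): a minimal, hypothetical check of hasher() on a
# hand-built frame in the 'label'/'name' layout produced by tokenizer() below. The labels and names are
# made up for illustration only.
def _hasher_smoke_test():
    sample = pd.DataFrame(data={'label': ['grammar', 'rule', 'rule', 'fragment'],
                                'name': ['Expr', 'expr', 'term', 'DIGIT']})
    vec = hasher(sample)
    assert abs(vec['count'].sum() - 1.0) < 1e-9  # the 'count' column is normalized to sum to 1
    assert list(vec.index) == ['elem', 'anno', 'vocab', 'rules', 'fragments', 'mode']
    return vec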
#=======================================================================================================================
# ToDo: insert repo
def tokenizer(file):
    # Step 1: strip comments
    file = open(file, "r", encoding="utf-8")
    f = comment_cleaner(file.read())
    file.close()
    vector = pd.DataFrame(data={'label': [], 'name': []})
    # pairs each line with its successor; note that the final line is only ever seen as next_line
    for line, next_line in zip(f, f[1:]):
        # 1. check for the grammar type: lexer/parser/grammar, ends with ';'
        x = line.strip().split()
        y = next_line.strip().split()
        if 'grammar' in line:
            vector = vector.append(pd.DataFrame(data={'label': [x[0]], 'name': [x[-1]]}))
        if ':' in line:
            if ':' == x[0]:
                pass
        if 'tokenVocab' in line:
            vector = vector.append(pd.DataFrame(data={'label': ['tokenVocab'], 'name': [line]}))
            # ToDo: block building!?
            # options come after tokenVocab=XYWRQWE
        # 2. check for annotations @text {*? ?*}
        if '@' in x[0][0]:
            vector = vector.append(pd.DataFrame(data={'label': ['annotation'], 'name': [x[0][1:]]}))
        # 3. check for rules -> they end with ';'
        # how to detect rules properly:
        # length of the rule, ordering by name, or similar
        if ':' in line or y[0] == ':':
            # 3.2 keywords like fragment and mode (see above) get extra rules
            if x[0] == 'mode' or x[0] == 'fragment':
                vector = vector.append(pd.DataFrame(data={'label': [x[0]], 'name': [x[1]]}))
            elif y[0] == ':':
                vector = vector.append(pd.DataFrame(data={'label': ['rule'], 'name': [x[0]]}))
            elif x[0] != ':':
                vector = vector.append(pd.DataFrame(data={'label': ['rule'], 'name': [x[0]]}))
    # ToDo: clean up ':' and ';'; hashing + normalizing
    vector = hasher(vector)
    return vector
#=======================================================================================================================
def comment_cleaner(f):
    # delim = delimiter
    delim_open = '/*'
    delim_close = '*/'
    delim_sl = '//'
    # beware of memory problems
    while delim_open in f and delim_close in f:
        op = f.index(delim_open)
        cl = f.index(delim_close) + 2
        if cl <= op:
            # stray '*/' before the next '/*': drop the closing delimiter itself
            pre_comment = f[:cl - 2]
            past_comment = f[cl:]
            f = pre_comment + past_comment
        else:
            pre_comment = f[:op]
            past_comment = f[cl:]
            f = pre_comment + past_comment
    while delim_sl in f:
        start = f.index(delim_sl)
        if (f[start - 1] == "'" and f[start + 2] == "'") or (f[start - 1] == '"' and f[start + 2] == '"'):
            # quoted '//' literal: replace it with '--' so it is not treated as a comment
            pre_comment = f[:start]
            past_comment = f[start + 2:]
            f = pre_comment + '--' + past_comment
        else:
            try:
                end = f[start:].index('\n') + start  # next newline
                pre_comment = f[:start]
                past_comment = f[end:]
                f = pre_comment + past_comment
            except ValueError:
                f = f[:start]
    f = f.splitlines()
    f = [line.strip() for line in f]
    while '' in f:
        f.remove('')
    return f
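#=======================================================================================================================
# Usage sketch (not part of the original commit). 'MyGrammar.g4' is a hypothetical path used for
# illustration only; the smoke test above needs no input file.
if __name__ == '__main__':
    print(_hasher_smoke_test())
    # vec = tokenizer('MyGrammar.g4')
    # vec is a 6x2 frame indexed by 'elem', 'anno', 'vocab', 'rules', 'fragments', 'mode'
    # with normalized 'count' and 'hashed' columns (see hasher() above).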
# Gets the directory with .g4 files and computes a CSV file with the vectors representing the grammars.
import os
import pandas as pd
import Tokenizer
df_ref = pd.DataFrame([])
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\data_scource\\antlr_gram' # Terry Parr Grammar
out_path ='C:\\Users\\Admin\\Documents\\Studium\\MSR\\msr_aeg\\tokenizedTerPar.csv'
count = 0
output_df = pd.DataFrame({
    'Repo-name': [],
    'file-name': [],
    'type': [],  # hashed or counted
    'elem': [],
    'anno': [],
    'vocab': [],
    'rules': [],
    'fragments': [],
    'mode': []
})
output_df.to_csv(out_path)
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.g4'):
            #print('processing file: ' + file)
            scribble = Tokenizer.tokenizer(os.path.join(root, file))
            output_df = pd.DataFrame({
                'Repo-name': ['TerrParr', 'TerrParr'],
                'file-name': [file, file],
                'type': ['count', 'hashed'],  # hashed or counted
                'elem': [scribble['count'].elem, scribble['hashed'].elem],
                'anno': [scribble['count'].anno, scribble['hashed'].anno],
                'vocab': [scribble['count'].vocab, scribble['hashed'].vocab],
                'rules': [scribble['count'].rules, scribble['hashed'].rules],
                'fragments': [scribble['count'].fragments, scribble['hashed'].fragments],
                'mode': [scribble['count']['mode'], scribble['hashed']['mode']]
            })
            output_df.to_csv(out_path, mode='a', header=False)
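#=======================================================================================================================
# Sketch (not part of the original commit): reading the result back for inspection. Each grammar
# contributes two rows, a 'count' row and a 'hashed' row, keyed by Repo-name and file-name.
result = pd.read_csv(out_path, index_col=0)
print(result.head())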
import pandas as pd
import git
import tempfile
import os
file = 'sample.csv'
path = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Versioning\\'
raw_data = pd.read_csv(file)[190:]
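# Input sketch (not part of the original commit): sample.csv is expected to hold a 'repository' column
# of GitHub 'owner/name' slugs, which are expanded to clone URLs below. The slugs here are hypothetical
# placeholders:
#
#     repository
#     someuser/some-antlr-grammar
#     otheruser/another-grammar-repo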
for repo in raw_data['repository']:
    print('repo ' + repo + ' is being checked')
    rep_add = 'https://github.com/' + repo
    path_dump = path + ''.join(repo.split('/'))
    os.mkdir(path_dump)
    print(path_dump)
    # creates a temporary directory name; git clone creates the directory itself
    t = tempfile.mktemp()
    try:
        rep_loc = git.Repo.clone_from(rep_add, t)
        for com in rep_loc.iter_commits():
            for comPath in com.stats.files:
                if comPath.endswith('.g4'):
                    try:
                        path_to_file = os.path.join(path_dump, comPath.split('/')[-1][:-3] + '-' + str(com.committed_date) + '.g4')
                        f = open(path_to_file, 'wb')
                        data = (com.tree / comPath).data_stream.read()
                        f.write(data)
                        f.close()
                    except Exception:
                        print('PROBLEM', 'Searched path', comPath, 'Commit')
                        for entry in com.tree:
                            print('------------------------', entry.name)
    except Exception:
        print('repo: ' + repo + ' could not be cloned')
import threading
import os
import time
from random import random

file_usage = threading.Semaphore(value=1)

def dummy_func(arg):
    if arg == 5:
        time.sleep(50)  # artificial delay for one worker to exercise the semaphore
    file_usage.acquire()
    print('file-access ' + str(arg))
    accumulated_file = open('C:\\Users\\Admin\\Documents\\Studium\\MSR\\test.txt', 'a')
    accumulated_file.write(str(arg))
    accumulated_file.close()
    file_usage.release()
    return 0

def loaddistribution(filepath):
    # filepath is unused while the os.walk variant below is commented out
    #for root, dirs, files in os.walk(filepath):
    for x in range(10):
        test = threading.Thread(target=dummy_func, args=[x])
        test.start()
        print('thread ' + str(x) + ' started')
        # ToDo: limit the number of concurrent threads? (see the pool sketch below)

loaddistribution(42)
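#=======================================================================================================================
# Sketch (not part of the original commit): one way to answer the thread-limit ToDo above is a fixed-size
# thread pool instead of one thread per item. ThreadPoolExecutor is standard library; max_workers=4 is an
# arbitrary illustrative value.
from concurrent.futures import ThreadPoolExecutor

def pooled_loaddistribution():
    with ThreadPoolExecutor(max_workers=4) as pool:
        # dummy_func still serializes the file writes through the semaphore above
        pool.map(dummy_func, range(10))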