#!/bin/sh
#
# Driver for the YouTube corpus pipeline. Runs, in order: the per-year
# crawlers, the keyword filter, the video-id listing, the id splitter, the
# sentence generator, the linguistic feature extractor, the document
# histogram, the feature frequency ranking, the input-matrix builder and the
# k-means clustering, then prints how long the last timed step took.
#
# This file was a Windows batch script only partly converted to sh (%VAR%
# references, REM, "set", "set /a", "SET /P"); it is now plain POSIX sh.
#
# Prerequisites:
#   - start the mongod server:
#       nohup sh /users/musk/nturenne/Scripts_Youtube/run_youtube/test/mongodb-hadoop.sh
#   - start the mongo client:
#       /projets/musk/Youtube/mongodb/mongodb_3.2.7/bin/mongo osirim-hadoop.irit.fr:28018
# Maintenance:
#   - shut the mongo server down cleanly:
#       /projets/musk/Youtube/mongodb/mongodb_3.2.7/bin/mongod --dbpath /projets/musk/Youtube/Youtube_MongoDB5/ --shutdown
#   - repair: rm /projets/musk/Youtube/Youtube_MongoDB5/mongod.lock, and
#     create a new database: /projets/musk/Youtube/mongo.conf
#   - list processes: ps -fux

# Fail on any misspelled/unset variable. "set -e" is deliberately NOT used:
# as in the batch original, a failing step does not stop the following ones.
set -u

#----------------------------------------------
# PARAMETERS
#----------------------------------------------

# Search query sent to the crawlers (alternatives kept for reference).
# REQUETEYOUTUBE="zadistes"
REQUETEYOUTUBE="molecular biology"
# REQUETEYOUTUBE="whistleblower"

# Keywords used to filter the raw collection (2 max).
# KEYWORD1="zadiste"
# KEYWORD2=""
KEYWORD1="molecular"
KEYWORD2="biology"
# KEYWORD1="whistleblower"
# KEYWORD2=""

# MongoDB database and collections (alternative set kept for reference).
# BASEMONGO="bio"
# COLLECTIONBRUTMONGO="corpus_bio.brut"
# COLLECTIONMETAMONGO="corpus_bio.meta"
# COLLECTIONTXTMONGO="corpus_bio.txt"
# COLLECTIONLINGMONGO="corpus_bio.ling"
BASEMONGO="zad"
COLLECTIONBRUTMONGO="corpus_zad.brut"
COLLECTIONMETAMONGO="corpus_zad.meta"
COLLECTIONTXTMONGO="corpus_zad.txt"
COLLECTIONLINGMONGO="corpus_zad.ling"

# Intermediate and output file names.
FILESENTENCE=Dictionary_Sentence.conll
FILETREETAGGER=output_tagparser.conll
FILEMALTPARSER=output_treeparser.conll
FILEFEATUREREL=output_REL_alfa.csv
FILEFEATURENER=output_NER_alfa.csv
FILEFEATURE2GRAM=output_2GRAM_alfa.csv
FILEFEATURE1GRAM=output_1GRAM_alfa.csv
FILEFEATURENG=output_NG_alfa.csv
FILEIDVIDEO=xaa
BLOCKIDVIDEO=200

# MongoDB endpoint. The original value ended with a space, which only worked
# because it was expanded unquoted; it is removed now that every expansion
# is quoted.
IPHOST="osirim-hadoop.irit.fr"
PORTHOST=28018
OS="lin"

# YouTube API credentials, one set per crawled year (placeholders).
CLIENTID2018="xxx"
CLIENTSECRET2018="xxx"
REFRECHTOKEN2018="xxx"

CLIENTID2017="xxx"
CLIENTSECRET2017="xxx"
REFRECHTOKEN2017="xxx"

CLIENTID2016="xxx"
CLIENTSECRET2016="xxx"
REFRECHTOKEN2016="xxx"

CLIENTID2015="xxx"
CLIENTSECRET2015="xxx"
REFRECHTOKEN2015="xxx"

CLIENTID2014="xxx"
CLIENTSECRET2014="xxx"
REFRECHTOKEN2014="xxx"

CLIENTID2013="xxx"
CLIENTSECRET2013="xxx"
REFRECHTOKEN2013="xxx"

CLIENTID2012="xxx"
CLIENTSECRET2012="xxx"
REFRECHTOKEN2012="xxx"

CLIENTID2011="xxx"
CLIENTSECRET2011="xxx"
REFRECHTOKEN2011="xxx"

CLIENTID2010="xxx"
CLIENTSECRET2010="xxx"
REFRECHTOKEN2010="xxx"

CLIENTID2009="xxx"
CLIENTSECRET2009="xxx"
REFRECHTOKEN2009="xxx"

CLIENTID2008="xxx"
CLIENTSECRET2008="xxx"
REFRECHTOKEN2008="xxx"

CLIENTID2007="xxx"
CLIENTSECRET2007="xxx"
REFRECHTOKEN2007="xxx"

# Hadoop installation paths, forwarded to the R scripts.
PATHHADOOPHIVE=/users/musk/nturenne/script_rhadoop/
PATHHADOOPCMD=/usr/hdp/2.6.3.0-235/hadoop/bin
PATHHADOOPHOME=/usr/hdp
PATHHADOOPPREFIX=/usr/hdp
PATHHADOOPSTREAMING=/usr/hdp/2.6.3.0-235/hadoop-mapreduce/hadoop-streaming-2.7.3.2.6.3.0-235.jar

# Working directories of the pipeline steps and locations of the tools.
PATHYOUTUBE=/projets/musk/Youtube/run_youtube
PATHRESU=/users/musk/nturenne/Scripts_Youtube/run_youtube/test
PATHLINGUISTIQUE=/projets/musk/Youtube/ling_youtube
PATHFREQ=/projets/musk/Youtube/ling_youtube/you_freq
PATHCLASSIF=/projets/musk/Youtube/run_youtube
PATHTOMALTPARSER=maltparser-1.8.1.jar
# NOTE(review): the next value is still a Windows path — confirm the Linux
# location.
PATHTOMORPHTAGGER=D:/Utilisateurs/turenne/Dropbox/_dev/Youtube/MorphTaggerArabe
PATHTOTREETAGGER=/projets/musk/Youtube/ling_youtube/treetagger
# NOTE(review): Windows path as well — confirm the Linux location.
PATHTOABB=D:/Utilisateurs/turenne/Dropbox/_dev/Youtube/Zip/TreeTagger/lib/french-abbreviations
PATHTOSTANFORD=/projets/musk/Youtube/ling_youtube/stanford_ner
PATHTOJAVA=/logiciels/java1.8/bin/

# The settings below are not referenced by any step of this script; they are
# kept unchanged from the original file.
CHEMINSCRIPT="/users/musk/nturenne/Scripts_Youtube/run_youtube/test"
os="lin"
dirInput="/users/musk/nturenne/Scripts_Youtube/run_youtube/test/"
dirHDFSInput="/tmp/testZ"
dirHDFSOutput="/tmp/outputt"
dirScriptRHadoop="/users/musk/nturenne/script_rhadoop/MapRed-wordcount-unix/"
JobName="Experiment"
NBWorker=12

#----------------------------------------------
# HELPERS
#----------------------------------------------

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the current time as "<epoch-seconds> <HH> <MM> <SS>".
# Epoch seconds (instead of the seconds-since-midnight the batch version
# derived from %time%) keep the elapsed time correct across midnight.
now() {
  date '+%s %H %M %S'
}

# Run the crawler of one year.
# Arguments: $1 - year (selects all_youcraw_<year>.sh)
#            $2 - client id, $3 - client secret, $4 - refresh token
crawl_year() {
  sh "all_youcraw_$1.sh" "$REQUETEYOUTUBE" "$IPHOST" "$PORTHOST" \
    "$BASEMONGO" "$COLLECTIONBRUTMONGO" "$2" "$3" "$4" "$PATHRESU"
}

# Defined up front so the final report works even if no step resets it.
time_avant=$(now)
time_apres=$time_avant

#----------------------------------------------
# youcraw: download the YouTube documents (raw collection)
#----------------------------------------------
cd "$PATHYOUTUBE" || die "cannot cd to $PATHYOUTUBE"

# time_avant=$(now)   (disabled for this step in the original as well)
crawl_year 2007 "$CLIENTID2007" "$CLIENTSECRET2007" "$REFRECHTOKEN2007"
crawl_year 2008 "$CLIENTID2008" "$CLIENTSECRET2008" "$REFRECHTOKEN2008"
crawl_year 2009 "$CLIENTID2009" "$CLIENTSECRET2009" "$REFRECHTOKEN2009"
crawl_year 2010 "$CLIENTID2010" "$CLIENTSECRET2010" "$REFRECHTOKEN2010"
crawl_year 2011 "$CLIENTID2011" "$CLIENTSECRET2011" "$REFRECHTOKEN2011"
crawl_year 2012 "$CLIENTID2012" "$CLIENTSECRET2012" "$REFRECHTOKEN2012"
crawl_year 2013 "$CLIENTID2013" "$CLIENTSECRET2013" "$REFRECHTOKEN2013"
crawl_year 2014 "$CLIENTID2014" "$CLIENTSECRET2014" "$REFRECHTOKEN2014"
crawl_year 2015 "$CLIENTID2015" "$CLIENTSECRET2015" "$REFRECHTOKEN2015"
crawl_year 2016 "$CLIENTID2016" "$CLIENTSECRET2016" "$REFRECHTOKEN2016"
crawl_year 2017 "$CLIENTID2017" "$CLIENTSECRET2017" "$REFRECHTOKEN2017"
crawl_year 2018 "$CLIENTID2018" "$CLIENTSECRET2018" "$REFRECHTOKEN2018"
time_apres=$(now)

#----------------------------------------------
# filtercollection: create a collection containing the keywords (2 max)
# (meta collection)
#----------------------------------------------
cd "$PATHYOUTUBE" || die "cannot cd to $PATHYOUTUBE"

time_avant=$(now)
# NOTE(review): an empty KEYWORD2 is now passed as an empty argument; the
# unquoted batch expansion dropped it and shifted the collection name one
# position to the left. Confirm which form FilterCollection.pl expects.
perl FilterCollection.pl "$OS" "$IPHOST" "$PORTHOST" "$BASEMONGO" \
  "$COLLECTIONBRUTMONGO" "$KEYWORD1" "$KEYWORD2" "$COLLECTIONMETAMONGO"
time_apres=$(now)

#----------------------------------------------
# IdBuilder: create the list of video ids
#----------------------------------------------
cd "$PATHLINGUISTIQUE" || die "cannot cd to $PATHLINGUISTIQUE"

time_avant=$(now)
# PATHRESU has no trailing slash, so the separator is added here (the
# original concatenated the directory and the file name directly).
python getIDS.py "$IPHOST" "$PORTHOST" "$BASEMONGO" "$COLLECTIONMETAMONGO" \
  > "$PATHRESU/$FILEIDVIDEO"
time_apres=$(now)

#----------------------------------------------
# IdSplitter: create 19 files of video ids to parallelise
#----------------------------------------------
mkdir -p "$PATHRESU/folder_ids"

time_avant=$(now)
# The batch version counted the lines of a hard-coded Windows copy of the id
# list (D:\Utilisateurs\turenne\Dropbox\_dev\TestCrawl\xaa), going through a
# temporary wc.txt read back with "SET /P number=".
# NOTE(review): the id list written by IdBuilder is assumed to be that same
# file — confirm.
number=$(wc -l < "$PATHRESU/$FILEIDVIDEO" | tr -d ' ')
# TODO(review): the rest of this step is missing from the file this script
# was restored from: everything between "SET /P number=" and a redirection
# into "$PATHRESU/$FILESENTENCE" was lost. The command(s) that use $number
# (and presumably BLOCKIDVIDEO) to split the id list, and whatever wrote the
# sentence file here, must be restored from another copy.
time_apres=$(now)

#----------------------------------------------
# SentenceGenerator: generate a list of sentences for each
# title+description+transcript (txt collection)
#----------------------------------------------
cd "$PATHLINGUISTIQUE" || die "cannot cd to $PATHLINGUISTIQUE"

sh all_genererCorpus_xaa.sh "$IPHOST" "$PORTHOST" "$BASEMONGO" \
  "$COLLECTIONMETAMONGO" "$COLLECTIONTXTMONGO" "$PATHRESU" "$FILESENTENCE"
time_apres=$(now)

#----------------------------------------------
# FeatureExtractor: generate the linguistic features (ling collection)
#----------------------------------------------
cd "$PATHLINGUISTIQUE" || die "cannot cd to $PATHLINGUISTIQUE"

time_avant=$(now)
sh all_genererfeature_xaa.sh "$IPHOST" "$PORTHOST" "$BASEMONGO" \
  "$COLLECTIONTXTMONGO" "$PATHRESU/$FILETREETAGGER" \
  "$PATHRESU/$FILEMALTPARSER" "$PATHRESU" "$COLLECTIONLINGMONGO" \
  "$PATHTOMALTPARSER" "$PATHTOMORPHTAGGER" "$PATHTOTREETAGGER" \
  "$PATHTOABB" "$PATHTOSTANFORD" "$PATHTOJAVA"
time_apres=$(now)

#----------------------------------------------
# DocHisto: generate a histogram of the documents over the years
#----------------------------------------------
cd "$PATHFREQ" || die "cannot cd to $PATHFREQ"

time_avant=$(now)
Rscript CalculHistodocR.txt "$OS" "$PATHHADOOPHIVE" "$PATHHADOOPCMD" \
  "$PATHHADOOPHOME" "$PATHHADOOPPREFIX" "$PATHHADOOPSTREAMING" \
  "$PATHRESU" "$BASEMONGO" "$COLLECTIONMETAMONGO"
time_apres=$(now)

#----------------------------------------------
# FreqRanker: generate sorted lists (alphabetical and by occurrence) of the
# features
#----------------------------------------------
cd "$PATHFREQ" || die "cannot cd to $PATHFREQ"

time_avant=$(now)
Rscript CalculFreqR.txt "$OS" "$PATHHADOOPHIVE" "$PATHHADOOPCMD" \
  "$PATHHADOOPHOME" "$PATHHADOOPPREFIX" "$PATHHADOOPSTREAMING" \
  "$PATHRESU" "$BASEMONGO" "$COLLECTIONLINGMONGO"
time_apres=$(now)

#----------------------------------------------
# CreatMat: generate the input matrices
#----------------------------------------------
cd "$PATHCLASSIF" || die "cannot cd to $PATHCLASSIF"

time_avant=$(now)
perl CreateMatrix.pl "$OS" "$BASEMONGO" "$COLLECTIONTXTMONGO" \
  "$COLLECTIONMETAMONGO" "$FILEFEATUREREL" "$FILEFEATURENER" \
  "$FILEFEATURE2GRAM" "$FILEFEATURE1GRAM" "$FILEFEATURENG" "$PATHRESU"
time_apres=$(now)

#----------------------------------------------
# CalculKM: generate classes with the k-means method
#----------------------------------------------
cd "$PATHCLASSIF" || die "cannot cd to $PATHCLASSIF"

time_avant=$(now)
Rscript CalculKmR.txt "$OS" "$PATHHADOOPHIVE" "$PATHHADOOPCMD" \
  "$PATHHADOOPHOME" "$PATHHADOOPPREFIX" "$PATHHADOOPSTREAMING" "$PATHRESU"
time_apres=$(now)

#----------------------------------------------
# Execution time
#----------------------------------------------
# Mission accomplished.
# As in the original, the report covers the interval between the LAST
# time_avant and the LAST time_apres, i.e. the final step only.
# Each timestamp prints as "<seconds> <HH> <MM> <SS>"; the first field is now
# epoch seconds rather than seconds since midnight.
echo "$time_avant"
echo "$time_apres"

time_avant_sec=${time_avant%% *}
time_apres_sec=${time_apres%% *}
diff_sec=$((time_apres_sec - time_avant_sec))
elapsed_h=$((diff_sec / 3600))
elapsed_m=$(((diff_sec % 3600) / 60))
elapsed_s=$((diff_sec % 60))
echo "la commande a pris ${elapsed_h}:${elapsed_m}:${elapsed_s} (total ${diff_sec} sec)"

# Durations noted at the end of the original file (their meaning is not
# documented there; presumably timings of earlier runs):
# 3h443s 2s 0s 1s 1min6 1h36min3s