#!/usr/bin/env python # -*- coding: utf-8 -*- # Liste postes # Copyright (C) 2010 Jean-Marie Favreau # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License 3 # as published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """Usage: liste-postes.py [options] Options: -s, --section=NB Numéro de la section (par défaut, 27) -t, --type=POSTE Type de poste. Exemple: "MCF" (défaut), "PR" -r, --repertoire=REP Répertoire où seront téléchargés les pdf de description des postes (par défaut, postes-publies) -h, --help Affiche cette aide """ import urllib2 import sys import re import os import shutil import getopt try: my_getopt = getopt.gnu_getopt except AttributeError: my_getopt = getopt.getopt from time import time from pdftools.pdffile import PDFDocument, PopGS, PushGS from pdftools.pdfpath import Path from pdftools.pdftext import Text def usage(): print __doc__ sys.exit(0) # liste des répertoires où sont stockés les fichiers liste_urls_dir = [ "https://www.galaxie.enseignementsup-recherche.gouv.fr/ensup/ListesPostesPublies/ANTEE/2010_1/", "https://www.galaxie.enseignementsup-recherche.gouv.fr/ensup/ListesPostesPublies/FIDIS/" ] def getPreviousDirectory(repertoire, date): previousDate = 0 for f in os.listdir(repertoire): date = 0 try: fdate = int(f) except: path if fdate != 0 and fdate != date: if previousDate < fdate: previousDate = fdate if previousDate != 0: return repertoire + os.sep + str(previousDate) else: return "" def isDate(text): return re.match("[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9]", text) != None def getMonth(text): return re.split("/", text)[1] def getDay(text): return re.split("/", text)[0] def getFile(postes, filename = ""): opener1 = urllib2.build_opener() pdfpostepage = opener1.open(postes) pdfpostes = pdfpostepage.read() if filename == "": filename = mktemp() fout = open(filename, "wb") fout.write(pdfpostes) fout.close() return filename def readPostes(filename, rep_destination, idSection, typeDePoste, precedent): doc = PDFDocument(filename, 1) inSection = False predClass = False values = [] datesfin = [] # load document for idpage in range(doc.count_pages()): page = doc.read_page(idpage + 1) content = page.read_contents() for c in content.contents: curClass = c.__class__ if curClass == PopGS and predClass == Path: inSection = True values.append([]) datesfin.append("") elif inSection: if curClass == list: values[-1].append("") for v in c: if v.__class__ == Text: values[-1][-1] += v.text if isDate(v.text): datesfin[-1] = v.text #else: print "Classe inconnue: ", c elif curClass == PushGS: inSection = False predClass = c.__class__ # compute the column that corresponds to the section idC = 0 for i in values[0]: if i == "Corps": corpsSection = idC if i == "Section": sectionColumn = idC if i == "Section2": sectionColumn2 = idC if i == "Section3": sectionColumn3 = idC idC += 1 if sectionColumn.__class__ != int: print "Erreur: pas de colonne section détectée." sys.exit(1) idCur = 0 nbNew = 0 for poste in values[1:]: section = 0 section2 = 0 section3 = 0 corps = "" try: date = datesfin[idCur] idCur += 1 msg = " " if int(getMonth(date)) < 3 or (int(getMonth(date)) == 3 and int(getDay(date)) < 15): msg = "!!!Attention!!! pour " msg += poste[1] corps = poste[corpsSection] section = int(poste[sectionColumn]) if sectionColumn2.__class__ == int: section2 = int(poste[sectionColumn2]) else: section2 = section if sectionColumn3.__class__ == int: section3 = int(poste[sectionColumn3]) else: section3 = section except: pass if (section == idSection or section2 == idSection or section3 == idSection) and corps == typeDePoste: nomPoste = rep_destination + os.sep + poste[2] + ".pdf" precedentNomPoste = precedent + os.sep + poste[2] + ".pdf" if precedent == "" or not os.path.exists(precedentNomPoste): nbNew += 1 reussite = False for prefixe in liste_urls_dir: url= prefixe + poste[0] + "/FOPC_" + poste[0] + "_" + poste[2] + ".pdf" try: getFile(url, nomPoste) print " Récupération de la fiche du poste ", poste[0], ",", poste[2], ",", date, ",", msg reussite = True break except: pass if not reussite: print " !!!Attention!!! Impossible de récupérer le fichier associé au poste", poste[0], ",", poste[2], ",", date, ",", msg else: shutil.copy(precedentNomPoste, nomPoste) print " Nombre de nouveaux:", nbNew ### début du script # récupération des informations de la ligne de commande try: opts, args = my_getopt(sys.argv[1:], "s:t:r:h", ["section=", "type=", "repertoire=", "help"]) except getopt.GetoptError, msg: print "Error: ", msg usage() sys.exit(2) repertoire = "postes-publies" section_voulue = 27 type_poste = "MCF" for o, a in opts: if o in ("-s", "--section"): try: section_voulue = int(a) except: print "Attention, la section doit être un entier" sys.exit(1) if o in ("-t", "--type"): type_poste = a if not type_poste in ("PR", "MCF"): print "Attention, possible que le type de poste soit inconnu:", type_poste if o in ("-r", "--repertoire"): repertoire = a if o in ("-h", "--help"): usage() sys.exit(0) if not os.path.exists(repertoire): print "Création du répertoire pour les postes" os.mkdir(repertoire) date = int(time()) repertoire_complet = repertoire + os.sep + str(date) precedent = getPreviousDirectory(repertoire, date) if os.path.exists(repertoire_complet): print "Attention, le répertoire", repertoire_complet, "existe déjà" print "Abandon" sys.exit(1) else: os.mkdir(repertoire_complet) print "> Nouveaux postes publiés" tmp1 = repertoire_complet + os.sep + "postes.pdf" url1 = "https://www.galaxie.enseignementsup-recherche.gouv.fr/ensup/ListesPostesPublies/Emplois_publies_TrieParCorps.pdf" getFile(url1, tmp1) readPostes(tmp1, repertoire_complet, section_voulue, type_poste, precedent) print "> Nouveaux postes pré-publiés" tmp2 = repertoire_complet + os.sep + "prepostes.pdf" url2 = "https://www.galaxie.enseignementsup-recherche.gouv.fr/ensup/ListesPostesPublies/Emplois_prepublies_TrieParCorps.pdf" getFile(url2, tmp2) readPostes(tmp2, repertoire_complet, section_voulue, type_poste, precedent)