Blog Spot

Programa que convierte un blog de BlogSpot (a partir de su exportación en XML) al formato PmWiki.

from lxml import etree
from datetime import datetime
from dateutil import parser
import copy
import re
import os
import requests
from bs4 import BeautifulSoup
def get_all_images(p_html):
    """Download every image referenced by ``<img>`` tags in an HTML fragment.

    Images are saved in the local ``imagenes`` directory under the base name
    of their URL. A tag is skipped when it has no ``src``, when the URL
    contains one of the excluded substrings, or when a file with the target
    name already exists. If the request fails or the response status is not
    200, a placeholder file containing the text ``error`` is written instead;
    because existing files are skipped, that URL is not requested again on a
    later run.

    Args:
        p_html: HTML text to scan. ``None``, an empty string, or text without
            ``<img`` makes the call a no-op.

    Returns:
        None.
    """
    if not p_html or '<img' not in p_html:
        return

    soup = BeautifulSoup(p_html, "html.parser")
    # Create the folder that stores the images.
    os.makedirs("imagenes", exist_ok=True)
    excluded = ('bergonzini', 'espacioblog', 'windsor')

    for img_tag in soup.find_all("img"):
        img_url = img_tag.get("src")
        # Check the URL before touching it: <img> tags without src yield None.
        if not img_url:
            continue
        if any(word in img_url for word in excluded):
            continue

        img_path = os.path.join("imagenes", os.path.basename(img_url))
        if os.path.exists(img_path):
            continue

        try:
            # A timeout keeps one unresponsive host from hanging the run.
            response = requests.get(img_url, timeout=30)
        except (requests.RequestException, ValueError):
            # Best effort: malformed URLs and network failures are recorded
            # as a placeholder file instead of aborting the whole download.
            response = None

        if response is not None and response.status_code == 200:
            with open(img_path, "wb") as file:
                file.write(response.content)
        else:
            with open(img_path, "w") as file:
                file.write('error')
def sanitizar(p_str):
    """Convert an HTML fragment into single-line PmWiki markup.

    Paragraph and line-break tags become the ``%0a`` newline escape, headings
    become ``!`` prefixes, bold/italic/small tags become their PmWiki
    equivalents, images become ``%width=50pct%URL`` and links become
    ``[[ URL | text ]]``. Closing tags and ``div``/``span`` wrappers are
    removed.

    Args:
        p_str: HTML text, or ``None``/``''``.

    Returns:
        The converted text; ``None`` or ``''`` is returned unchanged.
    """
    if p_str in (None, ''):
        return p_str

    # NOTE(review): the tag literals for <p>, <br>, <h1>, <b>, <strong>,
    # <i>, <em> and <small> were reconstructed from a copy of this script in
    # which a wiki renderer had consumed the tags themselves -- confirm them
    # against the author's source before relying on the exact set.
    reemplazos = (
        ('</p>', ''),
        ('<p>', '%0a'),
        ('<br>', '%0a'),
        ('<br/>', '%0a'),
        ('<br />', '%0a'),
        ('<h1>', '!'),
        ('<h2>', '!!'),
        ('<h3>', '!!!'),
        ('<h4>', '!!!!'),
        ('</h1>', ''),
        ('</div>', ''),
        ('</h2>', ''),
        ('</h3>', ''),
        ('</h4>', ''),
        ('<H1>', '!'),
        ('<H2>', '!!'),
        ('<H3>', '!!!'),
        ('<H4>', '!!!!'),
        ('</H1>', ''),
        ('</H2>', ''),
        ('</H3>', ''),
        ('</H4>', ''),
        ('<b>', "'''"),
        ('</b>', "'''"),
        ('<small>', "[- "),
        ('</small>', " -]"),
        ('<strong>', "'''"),
        ('</strong>', "'''"),
        ('<i>', "''"),
        ('</i>', "''"),
        ('<em>', "''"),
        ('</span>', ""),
        ('</em>', "''"),
        (';s:8:"category', ''),
        # A literal backslash followed by "n" (two characters), not a newline.
        (r'\n', r'%0a'),
    )
    cx = p_str
    for viejo, nuevo in reemplazos:
        cx = cx.replace(viejo, nuevo)

    # Images: keep only the source URL, displayed at half width.
    pattern = r'<img[^>]*src=["\'](.*?)["\'][^>]*>'
    cx = re.sub(pattern, r" %width=50pct%\1 ", cx)
    # Drop opening div/span tags, with or without attributes.
    cx = re.sub(r'<div\b[^>]*>', '', cx)
    cx = re.sub(r'<span\b[^>]*>', '', cx)
    # Links: the back-references carry the URL and the link text into the
    # PmWiki [[ target | text ]] form.
    cx = re.sub(r'<a [^>]*href="([^"]+)"[^>]*>(.*?)</a>', r' [[ \1 | \2 ]] ', cx)
    return cx
# Load the XML export and collect one dict per blog post into ``datos``.
# Each dict may hold: id, link, fecha, fichero, text, titulo, autor.
ATOM_NS = '{http://www.w3.org/2005/Atom}'

tree = etree.parse('blog-05-12-2025.xml')
datos = []
# Not read by the code visible here; kept so the module's global names are
# unchanged.
tmp = {}
data_titulo = ''
data_fecha = ''

# Walk every Atom <entry> element in the document.
for elem in tree.iter(ATOM_NS + 'entry'):
    entrada = {}
    es_post = True
    for ele in elem:
        if ele.tag == ATOM_NS + 'id':
            if ele.text and 'post' in ele.text:
                entrada['id'] = ele.text
            else:
                # Entries whose id does not mention "post" are discarded as
                # a whole, even if other fields were read before the <id>.
                es_post = False
                break
        # Any child carrying rel="alternate" supplies the public link.
        if ele.attrib.get('rel') == 'alternate':
            entrada['link'] = ele.attrib.get('href', '')
        if ele.tag == ATOM_NS + 'published':
            fecha_objeto = parser.parse(ele.text)
            entrada['fecha'] = fecha_objeto.strftime("%Y%m%d %H:%M")
            # Same timestamp without separators, used to build file names.
            entrada['fichero'] = fecha_objeto.strftime("%Y%m%d%H%M")
        elif ele.tag == ATOM_NS + 'content':
            # get_all_images(ele.text)
            entrada['text'] = sanitizar(ele.text)
        elif ele.tag == ATOM_NS + 'title':
            entrada['titulo'] = sanitizar(ele.text)
        elif ele.tag == ATOM_NS + 'author':
            for el in ele:
                if el.tag == ATOM_NS + 'name':
                    entrada['autor'] = el.text

    if not es_post or not entrada:
        continue
    # Entries whose link contains "showComment" are left out.
    if 'showComment' in entrada.get('link', ''):
        continue
    datos.append(entrada)

# Oldest first; "fecha" is formatted as YYYYMMDD HH:MM, so string order
# matches chronological order.
datos.sort(key=lambda x: x["fecha"])
# Write what was read: one PmWiki page file per post under ``out/`` plus the
# index page ``out/McRatas.BLOG`` that links to all of them.
# Text-mode files translate "\n" themselves, so "\n" is used instead of
# os.linesep (which would produce "\r\r\n" on Windows).
_CABECERA_PMWIKI = (
    'version=pmwiki-2.4.4 ordered=1 urlencoded=1\n'
    'agent=Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0\n'
    'charset=UTF-8\n'
)

os.makedirs('out', exist_ok=True)

# Index page content, accumulated in pieces and joined once at the end.
indice = [
    _CABECERA_PMWIKI
    + 'name=McRatas.BLOG\n'
    + 'text=(:title Listado de publicaciones:)' + r'%0a' + r'%0a'
]

for elemento in datos:
    fichero_origen = 'McRatas.BLOG' + elemento['fichero']
    fichero = fichero_origen
    contador = 0
    # Posts sharing the same timestamp (or files left from an earlier run)
    # get a numeric suffix instead of overwriting each other.
    while os.path.exists('out/' + fichero):
        fichero = fichero_origen + '_' + str(contador)
        contador += 1

    # Missing, None or empty titles fall back to a date-based one.
    if not elemento.get('titulo'):
        elemento['titulo'] = 'Sin título ' + elemento["fecha"]

    enlace = elemento.get('link')
    if enlace:
        elemento['link'] = 'Enlace: [[' + enlace + ' | ' + enlace + ' ]]'
    else:
        elemento['link'] = ''

    # Body followed by a footer with publication date, author and link.
    # A missing or None body/author is treated as empty text.
    texto = (
        (elemento.get('text') or '') + '\n'
        + '----' + '\n'
        + 'Publicado en : ' + elemento['fecha'] + '\n'
        + 'Por ' + (elemento.get('autor') or '') + '\n'
        + elemento['link']
    )
    # The page body must stay on a single line: real line breaks are
    # written as the %0a escape.
    elemento['text'] = texto.replace('\r\n', r'%0a').replace('\n', r'%0a')

    fint = (
        _CABECERA_PMWIKI
        + 'name=' + fichero + '\n'
        + 'text=(:title ' + elemento["titulo"] + ':)' + elemento["text"]
    )
    with open('out/' + fichero, 'w', encoding='utf-8') as fin:
        fin.write(fint)

    indice.append(
        '[[' + fichero + ' | ' + elemento['fecha'] + ']] '
        + elemento["titulo"] + r'%0a'
    )

fmaind = ''.join(indice)
with open('out/McRatas.BLOG', "w", encoding='utf-8') as fmain:
    fmain.write(fmaind)

[$[Get Code]]