Source code for service.preclass.processors.ppt2text

import os
import sys

from pymongo import MongoClient
from bson import ObjectId

from pptx import Presentation

from utils import get_channel, get_logger, now
from config import MONGO
from data.lecture import insert_file_snippet

import base64


[docs]
def png_to_base64(png_file_path):
	"""
	Read a PNG file from disk and return its bytes as Base64 text.

	Parameters:
	png_file_path: The relative or absolute path to the PNG image file.

	Returns:
	The UTF-8 decoded Base64 representation of the file's contents.

	Note:
	This function does not raise on failure. If the file is missing, or any
	other exception occurs, a human-readable error message is returned in
	place of the encoded data, so callers receive a string either way.
	"""
	try:
		# Binary mode: the bytes must be encoded exactly as stored on disk.
		with open(png_file_path, 'rb') as png_handle:
			raw_bytes = png_handle.read()
		# b64encode yields bytes; decode so the caller gets a plain str.
		return base64.b64encode(raw_bytes).decode('utf-8')
	except FileNotFoundError:
		# Missing file gets its own, more specific message.
		return "The specified file was not found. Please check the path is correct."
	except Exception as error:
		# Every other failure is reported with the exception text appended.
		return f"An error occurred during the conversion: {str(error)}"



[docs]
def extract_text_from_ppt(
	ppt_path: str,
	png_path: str,
	lecture_id: ObjectId,
	) -> None:
	"""Collect each slide's text and its PNG rendering and hand both to `insert_file_snippet`.

	Args:
		ppt_path (str): Path to the PowerPoint file
		png_path (str): Directory holding one PNG per slide, named by
			1-based slide position (`<png_path>/1.png`, `<png_path>/2.png`, ...)
		lecture_id (ObjectId): MongoDB ObjectId of the lecture

	For every slide, in presentation order, the function:
	- Reads the slide's PNG through `png_to_base64`. That helper returns an
	  error message string instead of raising, so whatever it returns is
	  forwarded unchanged as `png_base64`.
	- Gathers the text of every shape that has a text frame: one line per
	  paragraph, with a blank line after each shape.
	- Calls `insert_file_snippet` with the zero-based slide index, the
	  stripped text, the Base64 string, the lecture id and file type "pptx".
	"""

	deck = Presentation(ppt_path)

	for slide_index, slide in enumerate(deck.slides):
		# Image files are numbered from 1; forward slashes keep the path
		# uniform when os.path.join produced backslashes.
		image_stem = os.path.join(png_path, str(slide_index + 1)).replace("\\", "/")
		encoded_image = png_to_base64(f"{image_stem}.png")

		# One entry per text-bearing shape; each entry already ends with
		# the blank line that separates it from the next shape.
		shape_chunks = []
		for shape in slide.shapes:
			if not shape.has_text_frame:
				continue
			paragraph_lines = [
				paragraph.text + "\n"
				for paragraph in shape.text_frame.paragraphs
			]
			shape_chunks.append("".join(paragraph_lines) + "\n")

		slide_text = "".join(shape_chunks).strip()

		insert_file_snippet(
			idx=slide_index,
			content=slide_text,
			png_base64=encoded_image,
			lecture_id=lecture_id,
			file_type="pptx"
		)



[docs]
class SERVICE:
	"""Service class for handling PowerPoint to text conversion tasks.

	`trigger` records a job in MongoDB and publishes its id to the service
	queue. `launch_worker` consumes that queue and runs `callback` for each
	message. All members are static; the class is used as a namespace.

	Static Attributes:

			_collection (MongoClient=preclass.ppt2text): MongoDB collection for storing job information
				.. :noindex:
			_queue_name (str): RabbitMQ queue name for the service. Default to `preclass-ppt2text`
				.. :noindex:
			_logger: Logger instance for the service
				.. :noindex:
	"""

	# Created once, when the class body is executed (i.e. at import time).
	_collection = MongoClient(
		MONGO.HOST,
		MONGO.PORT
		).preclass.ppt2text
	_queue_name = "preclass-ppt2text"

	_logger = get_logger(
		__name__=__name__,
		__file__=__file__,
	)

	@staticmethod
	def trigger(
			parent_service: str,
			lecture_id: ObjectId,
			parent_job_id: ObjectId
			) -> ObjectId:
		"""Trigger a new PPT to text conversion job.

		Inserts a job document into `_collection`, then publishes the new
		document's id (as a string) to the service queue.

		Args:
			parent_service (str): Name of the parent service
			lecture_id (ObjectId): MongoDB ObjectId of the lecture
			parent_job_id (ObjectId): MongoDB ObjectId of the parent job

		Returns:
			ObjectId: The `inserted_id` of the job document. The value is
			returned as-is, not converted to `str`; only the queue message
			carries the string form.
		"""
		connection, channel = get_channel(SERVICE._queue_name)

		# Close the connection even if the insert or the publish fails, so
		# a failed trigger does not leave the RabbitMQ connection open.
		try:
			SERVICE._logger.info("Pushing job to MONGO")

			job_id = SERVICE._collection.insert_one(
				dict(
					parent_service=parent_service,
					created_time=now(),
					lecture_id=lecture_id,
					parent_job_id=parent_job_id,
				)
			).inserted_id

			SERVICE._logger.info("Pushing job to RabbitMQ")
			channel.basic_publish(
				exchange="",
				routing_key=SERVICE._queue_name,
				body=str(job_id)
			)
		finally:
			connection.close()

		SERVICE._logger.info("Job pushed to RabbitMQ")
		return job_id

	@staticmethod
	def callback(ch, method, properties, body):
		"""Process PowerPoint conversion jobs from the RabbitMQ queue.

		Looks up the job document named by the message, runs
		`extract_text_from_ppt` on the lecture's buffered files, stamps the
		job with a completion time, publishes the parent job id using the
		parent service name as routing key, and finally acknowledges the
		message.

		Args:
			ch: RabbitMQ channel
			method: RabbitMQ method frame
			properties: RabbitMQ properties
			body: Message body containing the job ID (bytes; decoded and
				parsed as an ObjectId)
		"""
		job_id = ObjectId(body.decode())
		job = SERVICE._collection.find_one(dict(_id=job_id))
		# NOTE(review): find_one returns None when no document matches; the
		# subscripts below would then raise TypeError and the message would
		# stay unacknowledged. Confirm how missing jobs should be handled.
		lecture_id, parent_service, parent_job_id = job["lecture_id"], job["parent_service"], job["parent_job_id"]
		SERVICE._logger.debug(f"Recieved PreClass PPT2TEXT Job - {lecture_id}")

		# Input locations are derived from the lecture id.
		extract_text_from_ppt(
			ppt_path=f"buffer/{lecture_id}/seed_file.pptx",
			png_path=f"buffer/{lecture_id}/pngs",
			lecture_id=lecture_id
			)

		SERVICE._collection.update_one(
			dict(_id=job_id),
			{"$set": dict(
				completion_time=now()
			)}
		)
		SERVICE._logger.info(f"Conversion Complete For {lecture_id}")

		# Notify the parent on its own connection; close that connection
		# even when the publish raises.
		parent_connection, parent_channel = get_channel(parent_service)
		try:
			parent_channel.basic_publish(
				exchange="",
				routing_key=parent_service,
				body=str(parent_job_id)
			)
		finally:
			parent_connection.close()

		# Acknowledge last: the consumer uses auto_ack=False, so the message
		# is only acknowledged after every step above has succeeded.
		ch.basic_ack(delivery_tag=method.delivery_tag)

	@staticmethod
	def launch_worker():
		"""Launch the worker to process PowerPoint conversion jobs.

		Starts consuming messages from the RabbitMQ queue and processes them
		with `callback`, using manual acknowledgements.
		Can be terminated with CTRL+C.
		"""
		try:
			connection, channel = get_channel(SERVICE._queue_name)

			channel.basic_consume(
				queue=SERVICE._queue_name,
				on_message_callback=SERVICE.callback,
				auto_ack=False,
			)
			SERVICE._logger.info('Worker Launched. To exit press CTRL+C')
			channel.start_consuming()
		except KeyboardInterrupt:
			SERVICE._logger.warning('Shutting Off Worker')
			# sys.exit raises SystemExit, which is caught right here, so the
			# process always ends through os._exit(0).
			try:
				sys.exit(0)
			except SystemExit:
				os._exit(0)



if __name__ == "__main__":
	# Script entry point: announce startup, then block in the consumer loop.
	# The banner names this service (ppt2text); it previously named the
	# pptx2pdf service.
	SERVICE._logger.warning("STARTING PRECLASS-PPT2TEXT SERVICE")
	SERVICE.launch_worker()