Giter Site home page Giter Site logo

Comments (30)

PuKoren avatar PuKoren commented on July 17, 2024 1

@raliste that sounds smart, I will give it a try. Thanks !

Update: thanks a lot, I used what you suggested with goroutines and I am down to 2.3s for 15 pages conversion at 200 DPI. Awesome.

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024 1

@xiaoxfan sure

here is the code:

// ToPNGs renders the pages of the PDF held in input to PNG and returns the
// encoded pages indexed by page number.
//
// The page indexes are split into chunks (createChunks / concurrency) and
// every chunk is rendered by its own goroutine using its own fitz document
// opened from input, so no document is shared between goroutines.
//
// An error is returned only when the document cannot be opened or when
// assertNumPage rejects the page count. A rendering failure inside a chunk
// stops that chunk: the failing page and the pages after it in the same chunk
// stay nil in the result, and the error itself is not returned to the caller.
func ToPNGs(input []byte, opts *Options) ([][]byte, error) {
    var pngs [][]byte

    doc, err := fitz.NewFromMemory(input)
    if err != nil {
        return pngs, errors.WithStack(err)
    }
    // Registered only after the error check so Close is never called on the
    // document of a failed open.
    defer doc.Close()

    numPage := doc.NumPage()
    pngs = make([][]byte, numPage)

    if err = assertNumPage(numPage); err != nil {
        return pngs, errors.WithStack(err)
    }

    // // SINGLE-THREADED
    // for n := 0; n < numPage; n++ {
    //     var png []byte

    //     // 72: 200ms, 144: 400ms, 200: 600ms, etc.
    //     png, err = doc.ImagePNG(n, opts.dpi())

    //     if err != nil {
    //         if logger != nil {
    //             // handle single page error here
    //         }
    //         continue
    //     }

    //     pngs[n] = png
    // }
    // END SINGLE-THREADED

    // MULTI-THREADED

    // renderChunk opens a private document, renders the pages listed in chunk
    // into pngs and closes the document again on every path (the original
    // skipped Close when a page failed to render). Each index belongs to
    // exactly one chunk, so goroutines write to distinct elements of pngs.
    renderChunk := func(chunk []int) error {
        doc, err := fitz.NewFromMemory(input)
        if err != nil {
            return errors.WithStack(err)
        }
        defer doc.Close()

        for _, index := range chunk {
            buf, err := doc.ImagePNG(index, opts.dpi())
            if err != nil {
                return errors.WithStack(err)
            }

            pngs[index] = buf
        }

        return nil
    }

    chunks := createChunks(numPage, concurrency())
    // Buffered with one slot per chunk: every goroutine sends exactly once
    // and can never block on the send.
    ch := make(chan error, len(chunks))

    for _, chunk := range chunks {
        go func(chunk []int) {
            // The document is already closed when the result is sent.
            ch <- renderChunk(chunk)
        }(chunk)
    }

    // Wait for one result per chunk.
    for range chunks {
        if err := <-ch; err != nil {
            if logger != nil {
                // handle single page errors here
            }
        }
    }
    // END MULTI-THREADED

    return pngs, nil

}

It contains both single-threaded and multi-threaded versions that are separated with comments like // MULTI-THREADED
In this sample single-threaded mode is disabled

It takes a lot of memory but it is faster

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024 1

@PuKoren
Thanks for your sharing. It's very helpful.

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024 1

@PuKoren Hello, I'm so sorry to bother you again.
Below is my code. Some PDFs work well and convert faster, but others hit a segmentation violation error, and the failures do not look random.
I am very confused. Is there any way to solve this problem? Thanks.
I use centos7

// Pdf2Images1 renders every page of the PDF in src to PNG at the given dpi and
// returns the encoded pages indexed by page number.
//
// A dpi <= 0 is replaced by defaultDPI. When pageLimit > 0 and the document
// has more pages than pageLimit, PageSizeErr is returned. One goroutine is
// started per page and each goroutine opens its own document from src.
// Failures inside a goroutine are only logged: the function still returns a
// nil error once all goroutines have finished.
func Pdf2Images1(src []byte, dpi float64, pageLimit int) ([][]byte, error) {
	if dpi <= 0 {
		dpi = defaultDPI
	}
	// This document is only used to read the page count.
	doc, err := fitz.NewFromMemory(src)
	if err != nil {
		return nil, err
	}
	defer doc.Close()
	if pageLimit > 0 && doc.NumPage() > pageLimit {
		return nil, PageSizeErr
	}
	ret := make([][]byte, doc.NumPage())
	wg := new(sync.WaitGroup)
	wg.Add(doc.NumPage())
	for n := 0; n < doc.NumPage(); n++ {
		go func(n int) {
			defer wg.Done()
			// Shadows the outer doc on purpose: each goroutine works on a
			// separate document instance created from the same bytes.
			doc, err := fitz.NewFromMemory(src)
			if err != nil {
				log.Println(err)
				return
			}
			defer doc.Close()
			// Every goroutine writes to its own element of ret.
			ret[n], err = doc.ImagePNG(n, dpi)
			if err != nil {
				log.Println(err)
				return
			}
		}(n)
	}
	wg.Wait()
	return ret, nil
}
panic log
fatal error: unexpected signal during runtime execution
fatal error: unexpected signal during runtime execution
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0x93ba62]

runtime stack:
runtime.throw(0xc408b8, 0x2a)
        /usr/local/go/src/runtime/panic.go:1117 +0x72
runtime.sigpanic()
        /usr/local/go/src/runtime/signal_unix.go:718 +0x2e5

goroutine 30 [syscall]:
runtime.cgocall(0x8f7d00, 0xc0002e7610, 0xc0000241b0)
        /usr/local/go/src/runtime/cgocall.go:154 +0x5b fp=0xc0002e75e0 sp=0xc0002e75a8 pc=0x4328db
github.com/gen2brain/go-fitz._Cfunc_fz_run_page(0x7f6cb0000a10, 0x7f6cb0029ff0, 0x7f6cb002a2c0, 0xc0000241b0, 0x0)
        _cgo_gotypes.go:1383 +0x45 fp=0xc0002e7610 sp=0xc0002e75e0 pc=0x8e1d45
github.com/gen2brain/go-fitz.(*Document).ImagePNG.func10(0xc00018e000, 0x7f6cb0029ff0, 0x7f6cb002a2c0, 0xc0000241b0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0xc5 fp=0xc0002e7650 sp=0xc0002e7610 pc=0x8e3b85
github.com/gen2brain/go-fitz.(*Document).ImagePNG(0xc00018e000, 0x2, 0x4052000000000000, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0x32b fp=0xc0002e7710 sp=0xc0002e7650 pc=0x8e276b
split-pdf/util.Pdf2Images1.func1(0xc00031a030, 0xc000680000, 0x435042, 0x470000, 0xc00031a020, 0xc00038e000, 0x5, 0x5, 0x2)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:64 +0x12e fp=0xc0002e7798 sp=0xc0002e7710 pc=0x8f5d0e
runtime.goexit()
        /usr/local/go/src/runtime/asm_amd64.s:1371 +0x1 fp=0xc0002e77a0 sp=0xc0002e7798 pc=0x49af01
created by split-pdf/util.Pdf2Images1
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:56 +0x20e

goroutine 1 [IO wait]:
internal/poll.runtime_pollWait(0x7f6ccac6aeb8, 0x72, 0x0)
        /usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc00038e198, 0x72, 0x0, 0x0, 0xc2bce2)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Accept(0xc00038e180, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
        /usr/local/go/src/internal/poll/fd_unix.go:401 +0x212
net.(*netFD).accept(0xc00038e180, 0x773bbd96961e7401, 0x0, 0x0)
        /usr/local/go/src/net/fd_unix.go:172 +0x45
net.(*TCPListener).accept(0xc000394180, 0x6081471f, 0xc00029fcc8, 0x4f3406)
        /usr/local/go/src/net/tcpsock_posix.go:139 +0x32
net.(*TCPListener).Accept(0xc000394180, 0xc00029fd18, 0x18, 0xc000000180, 0x6d541b)
        /usr/local/go/src/net/tcpsock.go:261 +0x65
net/http.(*Server).Serve(0xc0003d20e0, 0xcdf620, 0xc000394180, 0x0, 0x0)
        /usr/local/go/src/net/http/server.go:2981 +0x285
net/http.(*Server).ListenAndServe(0xc0003d20e0, 0xc0003d20e0, 0xc00029fec8)
        /usr/local/go/src/net/http/server.go:2910 +0xba
net/http.ListenAndServe(...)
        /usr/local/go/src/net/http/server.go:3164
github.com/gin-gonic/gin.(*Engine).Run(0xc0003c6340, 0xc00029ff58, 0x1, 0x1, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/gin.go:336 +0x1ba
main.main()
        /home/xiaofan/workspace/go/split-pdf/main.go:38 +0x3e5

goroutine 18 [semacquire]:
sync.runtime_Semacquire(0xc00031a038)
        /usr/local/go/src/runtime/sema.go:56 +0x45
sync.(*WaitGroup).Wait(0xc00031a030)
        /usr/local/go/src/sync/waitgroup.go:130 +0x65
split-pdf/util.Pdf2Images1(0xc000680000, 0x435042, 0x470000, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:71 +0x247
main.SplitPDF(0xc000392200)
        /home/xiaofan/workspace/go/split-pdf/main.go:97 +0x46d
github.com/gin-gonic/gin.(*Context).Next(...)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/context.go:165
github.com/gin-gonic/gin.LoggerWithConfig.func1(0xc000392200)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/logger.go:241 +0xf4
github.com/gin-gonic/gin.(*Context).Next(...)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/context.go:165
github.com/gin-gonic/gin.CustomRecoveryWithWriter.func1(0xc000392200)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/recovery.go:99 +0x7a
github.com/gin-gonic/gin.(*Context).Next(...)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/context.go:165
github.com/gin-gonic/gin.(*Engine).handleHTTPRequest(0xc0003c6340, 0xc000392200)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/gin.go:489 +0x2aa
github.com/gin-gonic/gin.(*Engine).ServeHTTP(0xc0003c6340, 0xcdf800, 0xc0003d21c0, 0xc000392100)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gin-gonic/[email protected]/gin.go:445 +0x15c
net/http.serverHandler.ServeHTTP(0xc0003d20e0, 0xcdf800, 0xc0003d21c0, 0xc000392100)
        /usr/local/go/src/net/http/server.go:2887 +0xa3
net/http.(*conn).serve(0xc0003ac320, 0xce0aa0, 0xc00038c3c0)
        /usr/local/go/src/net/http/server.go:1952 +0x8cd
created by net/http.(*Server).Serve
        /usr/local/go/src/net/http/server.go:3013 +0x39b

goroutine 27 [IO wait]:
internal/poll.runtime_pollWait(0x7f6ccac6add0, 0x72, 0xffffffffffffffff)
        /usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc00038e218, 0x72, 0x0, 0x1, 0xffffffffffffffff)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc00038e200, 0xc000382df1, 0x1, 0x1, 0x0, 0x0, 0x0)
        /usr/local/go/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc00038e200, 0xc000382df1, 0x1, 0x1, 0x0, 0x0, 0x0)
        /usr/local/go/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc0003860f8, 0xc000382df1, 0x1, 0x1, 0x0, 0x0, 0x0)
        /usr/local/go/src/net/net.go:183 +0x91
net/http.(*connReader).backgroundRead(0xc000382de0)
        /usr/local/go/src/net/http/server.go:692 +0x58
created by net/http.(*connReader).startBackgroundRead
        /usr/local/go/src/net/http/server.go:688 +0xd5

goroutine 28 [syscall]:
github.com/gen2brain/go-fitz._Cfunc_fz_run_page(0x7f6ca80008c0, 0x7f6ca8029480, 0x7f6ca8029750, 0xc00033e018, 0x0)
        _cgo_gotypes.go:1383 +0x45
github.com/gen2brain/go-fitz.(*Document).ImagePNG.func10(0xc00000e030, 0x7f6ca8029480, 0x7f6ca8029750, 0xc00033e018)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0xc5
github.com/gen2brain/go-fitz.(*Document).ImagePNG(0xc00000e030, 0x0, 0x4052000000000000, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0x32b
split-pdf/util.Pdf2Images1.func1(0xc00031a030, 0xc000680000, 0x435042, 0x470000, 0xc00031a020, 0xc00038e000, 0x5, 0x5, 0x0)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:64 +0x12e
created by split-pdf/util.Pdf2Images1
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:56 +0x20e

goroutine 29 [syscall]:
github.com/gen2brain/go-fitz._Cfunc_fz_run_page(0x7f6ca40008c0, 0x7f6ca40299a0, 0x7f6ca4029c70, 0xc0002bc2e8, 0x0)
        _cgo_gotypes.go:1383 +0x45
github.com/gen2brain/go-fitz.(*Document).ImagePNG.func10(0xc000394000, 0x7f6ca40299a0, 0x7f6ca4029c70, 0xc0002bc2e8)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0xc5
github.com/gen2brain/go-fitz.(*Document).ImagePNG(0xc000394000, 0x1, 0x4052000000000000, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0x32b
split-pdf/util.Pdf2Images1.func1(0xc00031a030, 0xc000680000, 0x435042, 0x470000, 0xc00031a020, 0xc00038e000, 0x5, 0x5, 0x1)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:64 +0x12e
created by split-pdf/util.Pdf2Images1
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:56 +0x20e

goroutine 31 [syscall]:
github.com/gen2brain/go-fitz._Cfunc_fz_run_page(0x7f6cb4000a10, 0x7f6cb402a510, 0x7f6cb402d330, 0xc000590018, 0x0)
        _cgo_gotypes.go:1383 +0x45
github.com/gen2brain/go-fitz.(*Document).ImagePNG.func10(0xc00058e000, 0x7f6cb402a510, 0x7f6cb402d330, 0xc000590018)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0xc5
github.com/gen2brain/go-fitz.(*Document).ImagePNG(0xc00058e000, 0x3, 0x4052000000000000, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0x32b
split-pdf/util.Pdf2Images1.func1(0xc00031a030, 0xc000680000, 0x435042, 0x470000, 0xc00031a020, 0xc00038e000, 0x5, 0x5, 0x3)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:64 +0x12e
created by split-pdf/util.Pdf2Images1
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:56 +0x20e

goroutine 32 [syscall]:
github.com/gen2brain/go-fitz._Cfunc_fz_run_page(0x7f6cc4029460, 0x7f6cc4053610, 0x7f6cc4055f10, 0xc000390048, 0x0)
        _cgo_gotypes.go:1383 +0x45 fp=0xc0002e8610 sp=0xc0002e85e0 pc=0x8e1d45
github.com/gen2brain/go-fitz.(*Document).ImagePNG.func10(0xc0003160c0, 0x7f6cc4053610, 0x7f6cc4055f10, 0xc000390048)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0xc5 fp=0xc0002e8650 sp=0xc0002e8610 pc=0x8e3b85
github.com/gen2brain/go-fitz.(*Document).ImagePNG(0xc0003160c0, 0x4, 0x4052000000000000, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/xiaofan/workspace/go/pkg/mod/github.com/gen2brain/[email protected]/fitz.go:254 +0x32b fp=0xc0002e8710 sp=0xc0002e8650 pc=0x8e276b
split-pdf/util.Pdf2Images1.func1(0xc00031a030, 0xc000680000, 0x435042, 0x470000, 0xc00031a020, 0xc00038e000, 0x5, 0x5, 0x4)
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:64 +0x12e fp=0xc0002e8798 sp=0xc0002e8710 pc=0x8f5d0e
created by split-pdf/util.Pdf2Images1
        /home/xiaofan/workspace/go/split-pdf/util/pdf_linux.go:56 +0x20e
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0x93ba62]

runtime stack:
runtime.throw(0xc408b8, 0x2a)
        /usr/local/go/src/runtime/panic.go:1117 +0x72
runtime.sigpanic()
        /usr/local/go/src/runtime/signal_unix.go:718 +0x2e5

from go-fitz.

TimRazumov avatar TimRazumov commented on July 17, 2024 1

Hello @xiaoxfan

I faced exactly the same problem. did you manage to find some solution?

from go-fitz.

gen2brain avatar gen2brain commented on July 17, 2024

Do you have any errors? Or example?

from go-fitz.

rondymesquita avatar rondymesquita commented on July 17, 2024

It seems I am facing the same problem. I just took the code from the example in the README and added a goroutine inside the loop. I adapted some of the code and am still not able to make it work. See my adapted code below.

package main

import (
  "sync"
	"fmt"
	"image/jpeg"
	"image"
	"log"
	"os"
	"path/filepath"
	"github.com/gen2brain/go-fitz"
)
// tmpDir is the folder that createOutputFolder (re)creates.
var tmpDir = "./output"

// createOutputFolder makes sure tmpDir exists and is empty: an existing folder
// is removed together with its contents and a fresh one is created.
//
// Failures to delete or create the folder terminate the program via
// log.Fatalf instead of being silently ignored.
func createOutputFolder() {
	// Anything other than "does not exist" (including an unexpected Stat
	// error) is treated as an existing folder and removed.
	if _, err := os.Stat(tmpDir); !os.IsNotExist(err) {
		fmt.Println("Deleting existing folder")
		if err := os.RemoveAll(tmpDir); err != nil {
			log.Fatalf("Error while deleting folder %q: %v", tmpDir, err)
		}
	}
	fmt.Println("Creating folder")
	if err := os.Mkdir(tmpDir, os.ModePerm); err != nil {
		log.Fatalf("Error while creating folder %q: %v", tmpDir, err)
	}
}

// extractImage renders page number of doc and returns the resulting image.
// A rendering error terminates the program via log.Fatalf.
//
// NOTE(review): doc is received by value, so every call works on a copy of
// the fitz.Document struct. A pointer is presumably the safer choice; confirm
// against the go-fitz API before changing the signature.
func extractImage(doc fitz.Document, number int) image.Image {
	img, err := doc.Image(number)
	if err != nil {
		// Fatalf with an explicit separator: log.Fatal(msg, err) printed the
		// error text glued directly onto the message.
		log.Fatalf("Error while extracting image: %v", err)
	}
	return img
}

// writeToFile encodes img as a JPEG (quality 1) and stores it in tmpDir under
// the name "test<number>.jpg", with number zero-padded to three digits. The
// file name is printed to standard output.
//
// Any failure to create, encode or close the file terminates the program via
// log.Fatalf, with the underlying error included in the message.
func writeToFile(img image.Image, number int) {
	name := fmt.Sprintf("test%03d.jpg", number)
	fmt.Println(name)

	f, err := os.Create(filepath.Join(tmpDir, name))
	if err != nil {
		log.Fatalf("Error while creating file: %v", err)
	}

	if err := jpeg.Encode(f, img, &jpeg.Options{Quality: 1}); err != nil {
		// log.Fatalf exits without running deferred calls, so close explicitly.
		f.Close()
		log.Fatalf("Error while saving image: %v", err)
	}
	// A failed Close can mean the data never reached the disk, so it is
	// reported like any other save error.
	if err := f.Close(); err != nil {
		log.Fatalf("Error while saving image: %v", err)
	}
}

// generate renders page number of doc to a JPEG file (extractImage followed by
// writeToFile) and marks one unit of work as done on wg when it returns.
//
// wg is a pointer because a sync.WaitGroup must not be copied: with a by-value
// parameter Done would decrement a private copy and the caller's Wait would
// never return.
func generate(doc fitz.Document, number int, wg *sync.WaitGroup) {
	defer wg.Done()
	fmt.Println("Generating image from page", number)
	img := extractImage(doc, number)
	writeToFile(img, number)
}

// main converts every page of 2pages.pdf to a JPEG in the output folder,
// starting one goroutine per page and waiting for all of them to finish.
func main() {
	doc, err := fitz.New("2pages.pdf")
	if err != nil {
		panic(err)
	}
	defer doc.Close()
	createOutputFolder()

	var wg sync.WaitGroup
	numPages := doc.NumPage()
	wg.Add(numPages)

	fmt.Println("===> Number of Pages", numPages)
	for number := 0; number < numPages; number++ {
		fmt.Println("=>", number)
		// NOTE(review): *doc hands every goroutine a copy of the Document
		// struct; whether go-fitz tolerates concurrent use of such copies
		// needs to be confirmed.
		go generate(*doc, number, &wg)
	}
	wg.Wait()
	fmt.Println("===> Done")
}

from go-fitz.

rondymesquita avatar rondymesquita commented on July 17, 2024

Also, I was using a very small document with 2 pages and text only (something like 80kb).

from go-fitz.

gen2brain avatar gen2brain commented on July 17, 2024

Methods are now protected with a mutex, so there should be no issues, but I didn't test.

from go-fitz.

rondymesquita avatar rondymesquita commented on July 17, 2024

I'm still not able to use goroutines...

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

Hello,

I think I am experiencing a similar issue. While the code doesn't crash, it seems that my images are mixed together.

Here is a sample code I currently use for concurrency:

    // Renders every page of the PDF concurrently and collects the pages as
    // base64-encoded PNG strings, ordered by page index.
    //
    // input is my PDF as an array of bytes
    doc, err := fitz.NewFromMemory(input)
    if err != nil {
        panic(err)
    }

    defer doc.Close()

    // pages[i] receives the encoded image of page i ("" when it failed).
    pages := make([]string, doc.NumPage())

    // ImageResult carries one rendered page back to the collecting loop
    // together with its page index, because results arrive in any order.
    type ImageResult struct {
        base64img string
        index int
    }
    // One buffer slot per page, so a goroutine never blocks on its send.
    ch := make(chan ImageResult, doc.NumPage())

    for n := 0; n < doc.NumPage(); n++ {
        go func(index int) {
            // NOTE(review): every goroutine calls Image on the same shared
            // doc; confirm that go-fitz allows concurrent use of a document.
            img, err := doc.Image(index)
            if err != nil {
                // Failures are reported as an empty string for this index.
                ch <- ImageResult{ "", index }
                return
            }

            buf := new(bytes.Buffer)
            err = png.Encode(buf, img)

            if err != nil {
                ch <- ImageResult{ "", index }
                return
            }

            ch <- ImageResult{
              base64img: base64.StdEncoding.EncodeToString(buf.Bytes()),
              index: index,
            }
        }(n)
    }

    // Exactly one result is sent per page, so receive that many.
    for n := 0; n < doc.NumPage(); n++ {
        res := <- ch
        pages[res.index] = res.base64img
    }

Without the goroutine it works fine, so I guess my code is OK.

Meanwhile I will try to look at the source code of the module and try to find the cause of the issue. I really need the performance boost of the goroutine (4 sec instead of > 12 on the PDF conversion, there is a lot of images) 😄

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

I added a PR (#12) to fix my issue, I'm not sure if this also fixes their issue

from go-fitz.

rondymesquita avatar rondymesquita commented on July 17, 2024

I could test and it seems to be working right now! Thanks @PuKoren. I will try to improve performance since when I test with a 12MB file, it breaks up.

Here the code

package main

import (
  "sync"
  "fmt"
  "image/jpeg"
  "log"
  "os"
  "path/filepath"
  "github.com/gen2brain/go-fitz"
  "reflect"
  "runtime"
)
// tmpDir is the folder that createOutputFolder (re)creates.
var tmpDir = "./output"

// createOutputFolder makes sure tmpDir exists and is empty: an existing folder
// is removed together with its contents and a fresh one is created.
//
// Failures to delete or create the folder terminate the program via
// log.Fatalf instead of being silently ignored.
func createOutputFolder() {
  // Anything other than "does not exist" (including an unexpected Stat error)
  // is treated as an existing folder and removed.
  if _, err := os.Stat(tmpDir); !os.IsNotExist(err) {
    fmt.Println("Deleting existing folder")
    if err := os.RemoveAll(tmpDir); err != nil {
      log.Fatalf("Error while deleting folder %q: %v", tmpDir, err)
    }
  }
  fmt.Println("Creating folder")
  if err := os.Mkdir(tmpDir, os.ModePerm); err != nil {
    log.Fatalf("Error while creating folder %q: %v", tmpDir, err)
  }
}

// generate renders page number of doc and stores it as a JPEG (quality 1) in
// tmpDir under the name "test<number>.jpg", with number zero-padded to three
// digits. Progress messages are printed to standard output.
//
// Any failure to render the page or to create, encode or close the file
// terminates the program via log.Fatalf, with the underlying error included.
func generate(doc *fitz.Document, number int) {
  fmt.Println("Generating image from page", number)
  img, err := doc.Image(number)
  if err != nil {
    // Explicit separator: log.Fatal(msg, err) printed the error text glued
    // directly onto the message.
    log.Fatalf("Error while extracting image: %v", err)
  }
  name := fmt.Sprintf("test%03d.jpg", number)

  f, err := os.Create(filepath.Join(tmpDir, name))
  if err != nil {
    log.Fatalf("Error while creating file: %v", err)
  }

  if err := jpeg.Encode(f, img, &jpeg.Options{Quality: 1}); err != nil {
    // log.Fatalf exits without running deferred calls, so close explicitly.
    f.Close()
    log.Fatalf("Error while saving image: %v", err)
  }
  // A failed Close can mean the data never reached the disk, so it is
  // reported like any other save error.
  if err := f.Close(); err != nil {
    log.Fatalf("Error while saving image: %v", err)
  }
  fmt.Println("Write finished", number)
}

// main converts every page of 12MB.pdf to a JPEG in the output folder. It
// limits the runtime to two OS threads running Go code, starts one goroutine
// per page and blocks until all of them have called Done.
func main() {
    runtime.GOMAXPROCS(2)

    doc, err := fitz.New("12MB.pdf")
    fmt.Println(reflect.TypeOf(doc))
    if err != nil {
      panic(err)
    }
    defer doc.Close()

    createOutputFolder()

    numPages := doc.NumPage()
    pending := new(sync.WaitGroup)
    pending.Add(numPages)

    fmt.Println("===> Number of Pages", numPages)
    for page := 0; page < numPages; page++ {
      fmt.Println("=>", page)
      // doc and pending are captured by the closure; the page number is
      // passed as an argument so each goroutine gets its own value.
      go func(page int) {
        fmt.Println("Called")
        defer pending.Done()
        generate(doc, page)
      }(page)
    }
    pending.Wait()
    fmt.Println("===> Done")
}

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@rondymesquita my PDF is around 16MB and 15 pages, but I don't experience any crash. What is your machine specs ?

from go-fitz.

rondymesquita avatar rondymesquita commented on July 17, 2024

I guess it is not related to the size itself but to the PDF content. If it has too many images and/or is a PDF made from scanned images, I sometimes get a crash. I'm running it inside a Docker container with limited memory and CPU. I have not tracked down the error yet.

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

I stopped using goroutines for now as it was crashing randomly despite the fix submitted previously. It happens once every 10 convert approx.

Still trying to figure out how to make it goroutine-friendly as I really need the multi-threading speed

from go-fitz.

raliste avatar raliste commented on July 17, 2024

I ended up taking a process fork approach. Each Image() gets a short-lived process. At first sight it looks cumbersome and CPU-intensive, but after many approaches, we found it's the most stable, secure and fast, specially when you RPC. Running thousands of Image() per day under 1 second.

image

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@raliste thanks for the update. I'm not sure I follow entirely, do you open the PDF with a go-fitz instance once in a separate process for each page inside the PDF ?

from go-fitz.

raliste avatar raliste commented on July 17, 2024

@PuKoren yes. There's a -fork flag that subprocess the same program with the args to the file, page and width.

from go-fitz.

gen2brain avatar gen2brain commented on July 17, 2024

I investigated a little how MuPDF works and what is needed to have real concurrency support, and there are some problems. What is needed is described here https://mupdf.com/docs/coding-overview.html#multi-threading, excerpt from there:

The following simple rules should be followed to ensure that multi-threaded operations run smoothly:

"No simultaneous calls to MuPDF in different threads are allowed to use the same context."

Most of the time it is simplest to just use a different context for every thread; just create a new context at the same time as you create the thread. For more details see "Cloning the context" below.

"No simultaneous calls to MuPDF in different threads are allowed to use the same document."

Only one thread can be accessing a document at a time, but once display lists are created from that document, multiple threads at a time can operate on them.

The document can be used from several different threads as long as there are safeguards in place to prevent the usages being simultaneous.

"No simultaneous calls to MuPDF in different threads are allowed to use the same device."

Calling a device simultaneously from different threads will cause it to get confused and may crash. Calling a device from several different threads is perfectly acceptable as long as there are safeguards in place to prevent the calls being simultaneous.

So context must be cloned, that is not a problem, but other rules are difficult to make right, especially not accessing the same document from other threads, but we can work with display lists in other threads (not sure what to do with them). I also tried to pass lock/unlock functions when creating context, and that works but goroutine is not the same as thread, probably you experienced crashes when some routine is started on a different thread.

There is an example here https://mupdf.com/docs/examples/multi-threaded.c, but for now, I don't see an easy way to apply that to Go. Any help or pointers are welcomed.

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024

@raliste that sounds smart, I will give it a try. Thanks !

Update: thanks a lot, I used what you suggested with goroutines and I am down to 2.3s for 15 pages conversion at 200 DPI. Awesome.
@PuKoren
Hi.
Could you share the solution? Thanks.

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@xiaoxfan maybe you can try to rename the variable inside the goroutine doc, err := fitz.NewFromMemory(src)

to something like doc2, err := fitz.NewFromMemory(src)
because doc is already declared outside of it, I'm unsure of the behavior 🤔

(its weird that in the code I pasted I also use the same naming)

sorry I realized I missed a function in the previous code:

// createChunks partitions the indexes 0..n-1 into contiguous chunks, in
// ascending order, so that each chunk can be handled by one worker.
//
// At most max chunks are returned (fewer when n < max); a max below 1 is
// treated as 1. For n <= 0 the result is nil. Chunk sizes differ by at most
// the remainder: every chunk has ceil(n/max) elements except possibly the
// last, which may be shorter.
func createChunks(n, max int) [][]int {
    // Nothing to split. This also avoids the division by zero the original
    // hit for an empty document.
    if n <= 0 {
        return nil
    }
    // Clamp the worker count to [1, n].
    if max < 1 {
        max = 1
    }
    if max > n {
        max = n
    }

    is := make([]int, n)
    for i := range is {
        is[i] = i
    }

    // Ceiling division: with floor division a remainder would spill into an
    // extra chunk and exceed max (e.g. n=5, max=2 gave three chunks).
    size := (n + max - 1) / max

    chunks := make([][]int, 0, max)
    for i := 0; i < n; i += size {
        j := i + size
        if j > n {
            j = n
        }
        // Three-index slice caps the capacity so appending to one chunk can
        // never overwrite the next chunk's elements.
        chunks = append(chunks, is[i:j:j])
    }
    return chunks
}

// concurrency returns the number of workers to use. It reads the N_WORKERS
// environment variable and falls back to 2 when the variable is unset, empty,
// not an integer, or not a positive number.
func concurrency() int {
    const defaultWorkers = 2

    if v := os.Getenv("N_WORKERS"); v != "" {
        // Zero or negative worker counts are rejected: they make no sense and
        // would be used as a divisor when the pages are split into chunks.
        if n, err := strconv.Atoi(v); err == nil && n > 0 {
            return n
        }
    }
    return defaultWorkers
}

Did you see a pattern in the files that are non-working vs the ones that are ok? Like the number of pages, the document size, etc.

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024

@PuKoren Thank you for your reply,
this problem still exists, even if I use the built-in func 'copy' to copy the input byte slice for each goroutine. In most cases it works fine, but a few specific PDFs cause a crash. That makes me very confused.

// Variant of the per-page loop: each goroutine renders one page from its own
// private copy of the input bytes.
for n := 0; n < doc1.NumPage(); n++ {
		go func(n int) {
			defer wg.Done()
			// Duplicate the input so no two documents are created from the
			// same byte slice.
			src1 := make([]byte,len(src))
			copy(src1,src)
			doc, err := fitz.NewFromMemory(src1)
			if err != nil {
				log.Println(err)
				return
			}
			defer doc.Close()
			// Every goroutine writes to its own element of ret; errors are
			// only logged.
			ret[n], err = doc.ImagePNG(n, dpi)
			if err != nil {
				log.Println(err)
				return
			}
		}(n)
	}

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@xiaoxfan are those PDF working when processed without goroutines with a single gofitz instance?

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024

@xiaoxfan are those PDF working when processed without goroutines with a single gofitz instance?

Yes

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@xiaoxfan If you use the same code I pasted here you get the same issue?

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024

@xiaoxfan If you use the same code I pasted here you get the same issue?

Yes, only a few specific pdf files (5 pages,no obvious difference from the others) will not work, the rest work good.
Update: MuPDF has a multi-threaded example, but I don't know how to wrap it with cgo.
https://mupdf.com/docs/examples/multi-threaded.c

from go-fitz.

PuKoren avatar PuKoren commented on July 17, 2024

@xiaoxfan I think it would be hard to debug without the PDF files to try locally
however I don't have enough time now to do it, so I can't help you much

from go-fitz.

xiaoxfan avatar xiaoxfan commented on July 17, 2024

@xiaoxfan I think it would be hard to debug without the PDF files to try locally
however I don't have enough time now to do it, so I can't help you much

@PuKoren
Thank you very much for your help, I will try to find other solutions.

from go-fitz.

gen2brain avatar gen2brain commented on July 17, 2024

Nice discussion in PyMUPDF pymupdf/PyMuPDF#97, but also without the solution.

from go-fitz.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.